Spaces:

MagicMeWizard
/

AI_Powered_Web_Scraper

Paused

App Files Files Community

MagicMeWizard committed on Jul 1

Commit

a4ca225

verified ·

1 Parent(s): e5de17a

Update app.py

Browse files

Files changed (1) hide show

app.py +1210 -525

app.py CHANGED Viewed

@@ -1,149 +1,146 @@
 """
-AI-Powered Web Scraper - app.py
-Professional-grade web content extraction and AI summarization tool for Hugging Face Spaces
 """
 import gradio as gr
-import requests
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
 import pandas as pd
-from datetime import datetime
 import json
 import re
-import time
-from typing import List, Dict, Optional, Tuple
 import logging
 from pathlib import Path
-import os
-from dataclasses import dataclass
-from transformers import pipeline
-import nltk
-from nltk.tokenize import sent_tokenize
-import asyncio
-import aiohttp
-from concurrent.futures import ThreadPoolExecutor
 import hashlib
-# Download required NLTK data
 try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt', quiet=True)
 # Configure logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 @dataclass
-class ScrapedContent:
-    """Data class for scraped content with metadata"""
     url: str
     title: str
     content: str
-    summary: str
     word_count: int
-    reading_time: int
-    extracted_at: str
-    author: Optional[str] = None
-    publish_date: Optional[str] = None
-    meta_description: Optional[str] = None
-    keywords: List[str] = None
-class SecurityValidator:
-    """Security validation for URLs and content"""
-    ALLOWED_SCHEMES = {'http', 'https'}
-    BLOCKED_DOMAINS = {
-        'localhost', '127.0.0.1', '0.0.0.0',
-        '192.168.', '10.', '172.16.', '172.17.',
-        '172.18.', '172.19.', '172.20.', '172.21.',
-        '172.22.', '172.23.', '172.24.', '172.25.',
-        '172.26.', '172.27.', '172.28.', '172.29.',
-        '172.30.', '172.31.'
-    }
-    @classmethod
-    def validate_url(cls, url: str) -> Tuple[bool, str]:
-        """Validate URL for security concerns"""
-        try:
-            parsed = urlparse(url)
-            # Check scheme
-            if parsed.scheme not in cls.ALLOWED_SCHEMES:
-                return False, f"Invalid scheme: {parsed.scheme}. Only HTTP/HTTPS allowed."
-            # Check for blocked domains
-            hostname = parsed.hostname or ''
-            if any(blocked in hostname for blocked in cls.BLOCKED_DOMAINS):
-                return False, "Access to internal/local networks is not allowed."
-            # Basic malformed URL check
-            if not parsed.netloc:
-                return False, "Invalid URL format."
-            return True, "URL is valid."
-        except Exception as e:
-            return False, f"URL validation error: {str(e)}"
-class RobotsTxtChecker:
-    """Check robots.txt compliance"""
-    @staticmethod
-    def can_fetch(url: str, user_agent: str = "*") -> bool:
-        """Check if URL can be fetched according to robots.txt"""
-        try:
-            parsed_url = urlparse(url)
-            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
-            response = requests.get(robots_url, timeout=5)
-            if response.status_code == 200:
-                # Simple robots.txt parsing (basic implementation)
-                lines = response.text.split('\n')
-                user_agent_section = False
-                for line in lines:
-                    line = line.strip()
-                    if line.startswith('User-agent:'):
-                        agent = line.split(':', 1)[1].strip()
-                        user_agent_section = agent == '*' or agent.lower() == user_agent.lower()
-                    elif user_agent_section and line.startswith('Disallow:'):
-                        disallowed = line.split(':', 1)[1].strip()
-                        if disallowed and url.endswith(disallowed):
-                            return False
-            return True
-        except Exception:
-            # If robots.txt can't be fetched, assume allowed
-            return True
-class ContentExtractor:
-    """Advanced content extraction with multiple strategies"""
     def __init__(self):
         self.session = requests.Session()
         self.session.headers.update({
-            'User-Agent': 'Mozilla/5.0 (compatible; AI-WebScraper/1.0; Research Tool)',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate',
             'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
         })
-    def extract_content(self, url: str) -> Optional[ScrapedContent]:
-        """Extract content from URL with robust error handling"""
         try:
-            # Security validation
-            is_valid, validation_msg = SecurityValidator.validate_url(url)
-            if not is_valid:
-                raise ValueError(f"Security validation failed: {validation_msg}")
-            # Check robots.txt
-            if not RobotsTxtChecker.can_fetch(url):
-                raise ValueError("robots.txt disallows scraping this URL")
             # Fetch content
             response = self.session.get(url, timeout=15)
@@ -152,125 +149,82 @@ class ContentExtractor:
             # Parse HTML
             soup = BeautifulSoup(response.content, 'html.parser')
-            # Extract metadata
             title = self._extract_title(soup)
-            author = self._extract_author(soup)
-            publish_date = self._extract_publish_date(soup)
-            meta_description = self._extract_meta_description(soup)
-            # Extract main content
-            content = self._extract_main_content(soup)
-            if not content or len(content.strip()) < 100:
-                raise ValueError("Insufficient content extracted")
-            # Calculate metrics
-            word_count = len(content.split())
-            reading_time = max(1, word_count // 200)  # Average reading speed
-            # Extract keywords
-            keywords = self._extract_keywords(content)
-            return ScrapedContent(
                 url=url,
                 title=title,
                 content=content,
-                summary="",  # Will be filled by AI summarizer
-                word_count=word_count,
-                reading_time=reading_time,
-                extracted_at=datetime.now().isoformat(),
-                author=author,
-                publish_date=publish_date,
-                meta_description=meta_description,
-                keywords=keywords
             )
         except Exception as e:
-            logger.error(f"Content extraction failed for {url}: {str(e)}")
-            raise
-    def _extract_title(self, soup: BeautifulSoup) -> str:
-        """Extract page title with fallbacks"""
-        # Try meta og:title first
-        og_title = soup.find('meta', property='og:title')
-        if og_title and og_title.get('content'):
-            return og_title['content'].strip()
-        # Try regular title tag
-        title_tag = soup.find('title')
-        if title_tag:
-            return title_tag.get_text().strip()
-        # Try h1 as fallback
-        h1_tag = soup.find('h1')
-        if h1_tag:
-            return h1_tag.get_text().strip()
-        return "No title found"
-    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
-        """Extract author information"""
-        # Try multiple selectors for author
-        author_selectors = [
-            'meta[name="author"]',
-            'meta[property="article:author"]',
-            '.author',
-            '.byline',
-            '[rel="author"]'
-        ]
-        for selector in author_selectors:
-            element = soup.select_one(selector)
-            if element:
-                if element.name == 'meta':
-                    return element.get('content', '').strip()
-                else:
-                    return element.get_text().strip()
-        return None
-    def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
-        """Extract publication date"""
-        date_selectors = [
-            'meta[property="article:published_time"]',
-            'meta[name="publishdate"]',
-            'time[datetime]',
-            '.publish-date',
-            '.date'
         ]
-        for selector in date_selectors:
             element = soup.select_one(selector)
             if element:
                 if element.name == 'meta':
                     return element.get('content', '').strip()
-                elif element.name == 'time':
-                    return element.get('datetime', '').strip()
                 else:
                     return element.get_text().strip()
-        return None
-    def _extract_meta_description(self, soup: BeautifulSoup) -> Optional[str]:
-        """Extract meta description"""
-        meta_desc = soup.find('meta', attrs={'name': 'description'})
-        if meta_desc:
-            return meta_desc.get('content', '').strip()
-        og_desc = soup.find('meta', property='og:description')
-        if og_desc:
-            return og_desc.get('content', '').strip()
-        return None
-    def _extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content with multiple strategies"""
         # Remove unwanted elements
-        for element in soup(['script', 'style', 'nav', 'header', 'footer',
-                           'aside', 'advertisement', '.ads', '.sidebar']):
             element.decompose()
-        # Try content-specific selectors first
         content_selectors = [
             'article',
             'main',
@@ -278,424 +232,1155 @@ class ContentExtractor:
             '.post-content',
             '.entry-content',
             '.article-body',
-            '#content',
-            '.story-body'
         ]
         for selector in content_selectors:
             element = soup.select_one(selector)
             if element:
                 text = element.get_text(separator=' ', strip=True)
-                if len(text) > 200:  # Minimum content threshold
                     return self._clean_text(text)
-        # Fallback: extract from body
         body = soup.find('body')
         if body:
-            text = body.get_text(separator=' ', strip=True)
-            return self._clean_text(text)
-        # Last resort: all text
         return self._clean_text(soup.get_text(separator=' ', strip=True))
     def _clean_text(self, text: str) -> str:
         """Clean extracted text"""
         # Remove extra whitespace
         text = re.sub(r'\s+', ' ', text)
-        # Remove common unwanted patterns
-        text = re.sub(r'Subscribe.*?newsletter', '', text, flags=re.IGNORECASE)
-        text = re.sub(r'Click here.*?more', '', text, flags=re.IGNORECASE)
-        text = re.sub(r'Advertisement', '', text, flags=re.IGNORECASE)
         return text.strip()
-    def _extract_keywords(self, content: str) -> List[str]:
-        """Extract basic keywords from content"""
-        # Simple keyword extraction (can be enhanced with NLP)
-        words = re.findall(r'\b[A-Za-z]{4,}\b', content.lower())
-        word_freq = {}
-        for word in words:
-            if word not in ['that', 'this', 'with', 'from', 'they', 'have', 'been', 'were', 'said']:
-                word_freq[word] = word_freq.get(word, 0) + 1
-        # Return top 10 keywords
-        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
-        return [word for word, freq in sorted_words[:10]]
-class AISummarizer:
-    """AI-powered content summarization"""
     def __init__(self):
-        self.summarizer = None
-        self._load_model()
-    def _load_model(self):
-        """Load summarization model with error handling"""
         try:
-            self.summarizer = pipeline(
-                "summarization",
-                model="facebook/bart-large-cnn",
-                tokenizer="facebook/bart-large-cnn"
             )
-            logger.info("Summarization model loaded successfully")
         except Exception as e:
-            logger.error(f"Failed to load summarization model: {e}")
-            # Fallback to a smaller model
-            try:
-                self.summarizer = pipeline(
-                    "summarization",
-                    model="sshleifer/distilbart-cnn-12-6"
-                )
-                logger.info("Fallback summarization model loaded")
-            except Exception as e2:
-                logger.error(f"Failed to load fallback model: {e2}")
-                self.summarizer = None
-    def summarize(self, content: str, max_length: int = 300) -> str:
-        """Generate AI summary of content"""
-        if not self.summarizer:
-            return self._extractive_summary(content)
         try:
-            # Split content into chunks if too long
-            max_input_length = 1024
-            chunks = self._split_content(content, max_input_length)
-            summaries = []
-            for chunk in chunks:
-                if len(chunk.split()) < 20:  # Skip very short chunks
-                    continue
-                result = self.summarizer(
-                    chunk,
-                    max_length=min(max_length, len(chunk.split()) // 2),
-                    min_length=30,
-                    do_sample=False
-                )
-                summaries.append(result[0]['summary_text'])
-            # Combine summaries
-            combined = ' '.join(summaries)
-            # If still too long, summarize again
-            if len(combined.split()) > max_length:
-                result = self.summarizer(
-                    combined,
-                    max_length=max_length,
-                    min_length=50,
-                    do_sample=False
-                )
-                return result[0]['summary_text']
-            return combined
         except Exception as e:
-            logger.error(f"AI summarization failed: {e}")
-            return self._extractive_summary(content)
-    def _split_content(self, content: str, max_length: int) -> List[str]:
-        """Split content into manageable chunks"""
-        sentences = sent_tokenize(content)
-        chunks = []
-        current_chunk = []
-        current_length = 0
-        for sentence in sentences:
-            sentence_length = len(sentence.split())
-            if current_length + sentence_length > max_length and current_chunk:
-                chunks.append(' '.join(current_chunk))
-                current_chunk = [sentence]
-                current_length = sentence_length
-            else:
-                current_chunk.append(sentence)
-                current_length += sentence_length
-        if current_chunk:
-            chunks.append(' '.join(current_chunk))
-        return chunks
-    def _extractive_summary(self, content: str) -> str:
-        """Fallback extractive summarization"""
-        sentences = sent_tokenize(content)
-        if len(sentences) <= 3:
-            return content
-        # Simple extractive approach: take first, middle, and last sentences
-        summary_sentences = [
-            sentences[0],
-            sentences[len(sentences) // 2],
-            sentences[-1]
-        ]
-        return ' '.join(summary_sentences)
-class WebScraperApp:
-    """Main application class"""
     def __init__(self):
-        self.extractor = ContentExtractor()
-        self.summarizer = AISummarizer()
-        self.scraped_data = []
-    def process_url(self, url: str, summary_length: int = 300) -> Tuple[str, str, str, str]:
-        """Process a single URL and return results"""
         try:
-            if not url.strip():
-                return "❌ Error", "Please enter a valid URL", "", ""
-            # Add protocol if missing
-            if not url.startswith(('http://', 'https://')):
-                url = 'https://' + url
-            # Extract content
-            with gr.update():  # Show progress
-                scraped_content = self.extractor.extract_content(url)
-            # Generate summary
-            summary = self.summarizer.summarize(scraped_content.content, summary_length)
-            scraped_content.summary = summary
-            # Store result
-            self.scraped_data.append(scraped_content)
-            # Format results
-            metadata = f"""
-            **📊 Content Analysis**
-            - **Title:** {scraped_content.title}
-            - **Author:** {scraped_content.author or 'Not found'}
-            - **Published:** {scraped_content.publish_date or 'Not found'}
-            - **Word Count:** {scraped_content.word_count:,}
-            - **Reading Time:** {scraped_content.reading_time} minutes
-            - **Extracted:** {scraped_content.extracted_at}
-            """
-            keywords_text = f"**🏷️ Keywords:** {', '.join(scraped_content.keywords[:10])}" if scraped_content.keywords else ""
-            return (
-                "✅ Success",
-                metadata,
-                f"**📝 AI Summary ({len(summary.split())} words):**\n\n{summary}",
-                keywords_text
-            )
-        except Exception as e:
-            error_msg = f"Failed to process URL: {str(e)}"
-            logger.error(error_msg)
-            return "❌ Error", error_msg, "", ""
-    def export_data(self, format_type: str) -> str:
-        """Export scraped data to file"""
-        if not self.scraped_data:
-            return "No data to export"
         try:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            if format_type == "CSV":
-                filename = f"scraped_data_{timestamp}.csv"
-                df = pd.DataFrame([
-                    {
-                        'URL': item.url,
-                        'Title': item.title,
-                        'Author': item.author,
-                        'Published': item.publish_date,
-                        'Word Count': item.word_count,
-                        'Reading Time': item.reading_time,
-                        'Summary': item.summary,
-                        'Keywords': ', '.join(item.keywords) if item.keywords else '',
-                        'Extracted At': item.extracted_at
-                    }
-                    for item in self.scraped_data
-                ])
-                df.to_csv(filename, index=False)
-            elif format_type == "JSON":
-                filename = f"scraped_data_{timestamp}.json"
-                data = [
-                    {
-                        'url': item.url,
-                        'title': item.title,
-                        'content': item.content,
-                        'summary': item.summary,
-                        'metadata': {
-                            'author': item.author,
-                            'publish_date': item.publish_date,
-                            'word_count': item.word_count,
-                            'reading_time': item.reading_time,
-                            'keywords': item.keywords,
-                            'extracted_at': item.extracted_at
-                        }
-                    }
-                    for item in self.scraped_data
-                ]
-                with open(filename, 'w', encoding='utf-8') as f:
-                    json.dump(data, f, indent=2, ensure_ascii=False)
-            return filename
         except Exception as e:
-            logger.error(f"Export failed: {e}")
-            return f"Export failed: {str(e)}"
-    def clear_data(self) -> str:
-        """Clear all scraped data"""
-        self.scraped_data.clear()
-        return "Data cleared successfully"
-def create_interface():
-    """Create the Gradio interface"""
-    app = WebScraperApp()
-    # Custom CSS for professional appearance
     custom_css = """
     .gradio-container {
-        max-width: 1200px;
         margin: auto;
     }
-    .main-header {
-        text-align: center;
-        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
         color: white;
         padding: 2rem;
-        border-radius: 10px;
         margin-bottom: 2rem;
     }
-    .feature-box {
-        background: #f8f9fa;
-        border: 1px solid #e9ecef;
-        border-radius: 8px;
         padding: 1.5rem;
         margin: 1rem 0;
     }
-    .status-success {
-        color: #28a745;
-        font-weight: bold;
     }
-    .status-error {
-        color: #dc3545;
         font-weight: bold;
     }
     """
-    with gr.Blocks(css=custom_css, title="AI Web Scraper") as interface:
         # Header
         gr.HTML("""
-        <div class="main-header">
-            <h1>🤖 AI-Powered Web Scraper</h1>
-            <p>Professional content extraction and summarization for journalists, analysts, and researchers</p>
         </div>
         """)
-        # Main interface
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Input section
-                gr.HTML("<div class='feature-box'><h3>📡 Content Extraction</h3></div>")
-                url_input = gr.Textbox(
-                    label="Enter URL to scrape",
-                    placeholder="https://example.com/article",
-                    lines=1
-                )
                 with gr.Row():
-                    summary_length = gr.Slider(
-                        minimum=100,
-                        maximum=500,
-                        value=300,
-                        step=50,
-                        label="Summary Length (words)"
-                    )
-                scrape_btn = gr.Button("🚀 Extract & Summarize", variant="primary", size="lg")
-                # Results section
-                gr.HTML("<div class='feature-box'><h3>📊 Results</h3></div>")
-                status_output = gr.Textbox(label="Status", lines=1, interactive=False)
-                metadata_output = gr.Markdown(label="Metadata")
-                summary_output = gr.Markdown(label="AI Summary")
-                keywords_output = gr.Markdown(label="Keywords")
-            with gr.Column(scale=1):
-                # Export section
-                gr.HTML("<div class='feature-box'><h3>💾 Export Options</h3></div>")
-                export_format = gr.Radio(
-                    choices=["CSV", "JSON"],
-                    label="Export Format",
-                    value="CSV"
-                )
-                export_btn = gr.Button("📥 Export Data", variant="secondary")
-                export_status = gr.Textbox(label="Export Status", lines=2, interactive=False)
-                gr.HTML("<div class='feature-box'><h3>🧹 Data Management</h3></div>")
-                clear_btn = gr.Button("🗑️ Clear All Data", variant="secondary")
-                clear_status = gr.Textbox(label="Clear Status", lines=1, interactive=False)
-        # Usage instructions
-        with gr.Accordion("📚 Usage Instructions", open=False):
-            gr.Markdown("""
-            ### How to Use This Tool
-            1. **Enter URL**: Paste the URL of the article or webpage you want to analyze
-            2. **Adjust Settings**: Set your preferred summary length
-            3. **Extract Content**: Click "Extract & Summarize" to process the content
-            4. **Review Results**: View the extracted metadata, AI summary, and keywords
-            5. **Export Data**: Save your results in CSV or JSON format
-            ### Features
-            - 🛡️ **Security**: Built-in URL validation and robots.txt compliance
-            - 🤖 **AI Summarization**: Advanced BART model for intelligent summarization
-            - 📊 **Rich Metadata**: Author, publication date, reading time, and more
-            - 🏷️ **Keyword Extraction**: Automatic identification of key terms
-            - 💾 **Export Options**: CSV and JSON formats for further analysis
-            - 🔄 **Batch Processing**: Process multiple URLs and export all results
-            ### Supported Content
-            - News articles and blog posts
-            - Research papers and reports
-            - Documentation and guides
-            - Most HTML-based content
-            ### Limitations
-            - Respects robots.txt restrictions
-            - Cannot access password-protected content
-            - Some dynamic content may not be captured
-            - Processing time varies with content length
-            """)
-        # Event handlers
         scrape_btn.click(
-            fn=app.process_url,
-            inputs=[url_input, summary_length],
-            outputs=[status_output, metadata_output, summary_output, keywords_output]
         )
         export_btn.click(
-            fn=app.export_data,
-            inputs=[export_format],
-            outputs=[export_status]
         )
-        clear_btn.click(
-            fn=app.clear_data,
-            outputs=[clear_status]
         )
     return interface
 # Launch the application
 if __name__ == "__main__":
-    interface = create_interface()
-    interface.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        show_error=True
-    )

 """
+AI Dataset Studio - Modern Web Scraping & Dataset Creation Platform
+A mini Scale AI for non-coders and vibe coders
+Features:
+- Intelligent web scraping with content extraction
+- Automated data cleaning and preprocessing
+- Interactive annotation tools
+- Template-based workflows for common ML tasks
+- High-quality dataset generation
+- Export to HuggingFace Hub and popular ML formats
+- Visual data quality metrics
+- No-code dataset creation workflows
 """
 import gradio as gr
 import pandas as pd
+import numpy as np
 import json
 import re
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse, urljoin
+from datetime import datetime, timedelta
 import logging
+from typing import Dict, List, Tuple, Optional, Any
+from dataclasses import dataclass, asdict
 from pathlib import Path
+import uuid
 import hashlib
+import time
+from collections import defaultdict
+import io
+import zipfile
+# Optional imports with fallbacks
+try:
+    from transformers import pipeline, AutoTokenizer, AutoModel
+    from sentence_transformers import SentenceTransformer
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
 try:
+    import nltk
+    from nltk.tokenize import sent_tokenize, word_tokenize
+    from nltk.corpus import stopwords
+    HAS_NLTK = True
+except ImportError:
+    HAS_NLTK = False
+try:
+    from datasets import Dataset, DatasetDict
+    HAS_DATASETS = True
+except ImportError:
+    HAS_DATASETS = False
 # Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Download NLTK data if available. This is best-effort: a failed download
+# (e.g. no network access) must not stop the app from starting, but it is
+# logged so that missing tokenizer/stopword data can be diagnosed later.
+# Each resource is attempted on its own so one failure does not skip the rest.
+if HAS_NLTK:
+    for _resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
+        try:
+            nltk.download(_resource, quiet=True)
+        except Exception as e:
+            logger.warning(f"⚠️ Could not download NLTK resource '{_resource}': {e}")
 @dataclass
+class ScrapedItem:
+    """Data class for one scraped page: its text plus bookkeeping fields.
+    ``labels`` and ``annotations`` default to ``None`` in the signature and are
+    replaced with a fresh list/dict per instance in ``__post_init__``, so
+    instances never share a mutable default.
+    """
+    # Unique identifier of the item (scrape_url passes str(uuid.uuid4())).
+    id: str
     url: str
     title: str
     content: str
+    # Page-level details collected alongside the text.
+    metadata: Dict[str, Any]
+    # Timestamp string (scrape_url passes datetime.now().isoformat()).
+    scraped_at: str
     word_count: int
+    language: str = "en"
+    quality_score: float = 0.0
+    labels: List[str] = None
+    annotations: Dict[str, Any] = None
+    def __post_init__(self):
+        # Swap the None placeholders for per-instance containers.
+        if self.labels is None:
+            self.labels = []
+        if self.annotations is None:
+            self.annotations = {}
+@dataclass
+class DatasetTemplate:
+    """Template for dataset creation: a named task type plus its expected fields.
+    NOTE(review): the code that consumes these fields is not visible here, so
+    the per-field comments below are read from the names and type hints only.
+    """
+    name: str
+    description: str
+    task_type: str  # classification, ner, qa, summarization, etc.
+    # Presumably the field names every record must contain - verify against the consumer.
+    required_fields: List[str]
+    # Presumably field names a record may additionally contain - verify as above.
+    optional_fields: List[str]
+    # Presumably a sample record for this template, as a plain dict - confirm.
+    example_format: Dict[str, Any]
+    # Presumably free-text guidance for whoever fills in the dataset - confirm.
+    instructions: str
+class WebScraperEngine:
+    """Advanced web scraping engine with smart content extraction"""
     def __init__(self):
+        """Create the shared HTTP session and try to load the optional AI models."""
         self.session = requests.Session()
+        # Default headers sent with every request made through this session.
         self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0; Research)',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate',
             'Connection': 'keep-alive',
         })
+        # Initialize AI models if available
+        # Both attributes start as None; _load_models() only ever assigns
+        # quality_scorer, and only when the pipeline can be built.
+        self.content_classifier = None
+        self.quality_scorer = None
+        self._load_models()
+    def _load_models(self):
+        """Load AI models for content analysis.
+        Only logs a warning and returns when the optional transformers import
+        failed (HAS_TRANSFORMERS is False). An error while building the
+        pipeline is caught and logged as a warning, leaving
+        ``self.quality_scorer`` unchanged.
+        """
+        if not HAS_TRANSFORMERS:
+            logger.warning("⚠️ Transformers not available, using rule-based methods")
+            return
         try:
+            # Content quality assessment
+            # NOTE(review): the model id names a toxic-comment classifier; confirm
+            # that toxicity is the intended signal for "quality".
+            self.quality_scorer = pipeline(
+                "text-classification",
+                model="martin-ha/toxic-comment-model",
+                return_all_scores=True
+            )
+            logger.info("✅ Quality assessment model loaded")
+        except Exception as e:
+            logger.warning(f"⚠️ Could not load quality model: {e}")
+    def scrape_url(self, url: str) -> Optional[ScrapedItem]:
+        """Scrape a single URL and return structured data"""
+        try:
+            # Validate URL
+            if not self._is_valid_url(url):
+                raise ValueError("Invalid URL provided")
             # Fetch content
             response = self.session.get(url, timeout=15)
             # Parse HTML
             soup = BeautifulSoup(response.content, 'html.parser')
+            # Extract structured data
             title = self._extract_title(soup)
+            content = self._extract_content(soup)
+            metadata = self._extract_metadata(soup, response)
+            # Create scraped item
+            item = ScrapedItem(
+                id=str(uuid.uuid4()),
                 url=url,
                 title=title,
                 content=content,
+                metadata=metadata,
+                scraped_at=datetime.now().isoformat(),
+                word_count=len(content.split()),
+                quality_score=self._assess_quality(content)
             )
+            return item
         except Exception as e:
+            logger.error(f"Failed to scrape {url}: {e}")
+            return None
+    def batch_scrape(self, urls: List[str], progress_callback=None, delay: float = 1.0) -> List[ScrapedItem]:
+        """Scrape multiple URLs one after another with progress tracking.
+        Args:
+            urls: URLs to scrape, in order.
+            progress_callback: optional callable invoked before each URL as
+                ``progress_callback(fraction_done, message)``.
+            delay: seconds to wait between consecutive requests (rate limiting).
+        Returns:
+            The items that were scraped successfully, in input order; URLs for
+            which ``scrape_url`` returns nothing are left out.
+        """
+        results = []
+        total = len(urls)
+        for i, url in enumerate(urls):
+            # Rate limiting: pause between requests only, so a single URL (or
+            # the last one of a batch) does not cost an extra, pointless wait.
+            if i > 0 and delay > 0:
+                time.sleep(delay)
+            if progress_callback:
+                progress_callback(i / total, f"Scraping {i+1}/{total}: {url[:50]}...")
+            item = self.scrape_url(url)
+            if item:
+                results.append(item)
+        return results
+    def _is_valid_url(self, url: str) -> bool:
+        """Return True only for absolute http(s) URLs that include a host part.
+        Only the URL's syntax is checked; nothing is resolved or fetched, and
+        private or loopback hosts are not rejected here.
+        """
+        if not isinstance(url, str):
+            return False
+        try:
+            parsed = urlparse(url)
+        except ValueError:
+            # urlparse raises ValueError for malformed netlocs such as "http://[bad".
+            return False
+        # bool() keeps the declared return type; the bare `and` expression
+        # would hand back the netloc string itself.
+        return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
+    def _extract_title(self, soup: BeautifulSoup) -> str:
+        """Extract the page title, falling back through several sources.
+        Sources are tried in order: Open Graph title, Twitter card title, the
+        <title> tag, then the first <h1>. A source that is present but empty
+        is skipped so that a blank tag cannot mask a usable title further down.
+        Returns "Untitled" when no source yields any text.
+        """
+        # Try multiple selectors
+        selectors = [
+            'meta[property="og:title"]',
+            'meta[name="twitter:title"]',
+            'title',
+            'h1'
         ]
+        for selector in selectors:
             element = soup.select_one(selector)
+            if not element:
+                continue
+            if element.name == 'meta':
+                text = (element.get('content') or '').strip()
+            else:
+                text = element.get_text().strip()
+            # Only accept a non-empty title; otherwise try the next source.
+            if text:
+                return text
+        return "Untitled"
+    def _extract_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content using multiple strategies"""
         # Remove unwanted elements
+        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             element.decompose()
+        # Try content-specific selectors
         content_selectors = [
             'article',
             'main',
             '.post-content',
             '.entry-content',
             '.article-body',
+            '[role="main"]'
         ]
         for selector in content_selectors:
             element = soup.select_one(selector)
             if element:
                 text = element.get_text(separator=' ', strip=True)
+                if len(text) > 200:
                     return self._clean_text(text)
+        # Fallback to body
         body = soup.find('body')
         if body:
+            return self._clean_text(body.get_text(separator=' ', strip=True))
         return self._clean_text(soup.get_text(separator=' ', strip=True))
+    def _extract_metadata(self, soup: BeautifulSoup, response) -> Dict[str, Any]:
+        """Extract metadata from page"""
+        metadata = {
+            'domain': urlparse(response.url).netloc,
+            'status_code': response.status_code,
+            'content_type': response.headers.get('content-type', ''),
+            'extracted_at': datetime.now().isoformat()
+        }
+        # Extract meta tags
+        meta_tags = ['description', 'keywords', 'author', 'published_time']
+        for tag in meta_tags:
+            element = soup.find('meta', attrs={'name': tag}) or soup.find('meta', attrs={'property': f'article:{tag}'})
+            if element:
+                metadata[tag] = element.get('content', '')
+        return metadata
     def _clean_text(self, text: str) -> str:
         """Clean extracted text"""
         # Remove extra whitespace
         text = re.sub(r'\s+', ' ', text)
+        # Remove common patterns
+        patterns = [
+            r'Subscribe.*?newsletter',
+            r'Click here.*?more',
+            r'Advertisement',
+            r'Share this.*?social',
+            r'Follow us on.*?media'
+        ]
+        for pattern in patterns:
+            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
         return text.strip()
+    def _assess_quality(self, content: str) -> float:
+        """Assess content quality (0-1 score)"""
+        if not content:
+            return 0.0
+        score = 0.0
+        # Length check
+        word_count = len(content.split())
+        if word_count >= 50:
+            score += 0.3
+        elif word_count >= 20:
+            score += 0.1
+        # Structure check (sentences)
+        sentence_count = len(re.split(r'[.!?]+', content))
+        if sentence_count >= 3:
+            score += 0.2
+        # Language quality (basic)
+        if re.search(r'[A-Z][a-z]+', content):  # Proper capitalization
+            score += 0.2
+        if not re.search(r'[^\w\s]', content[:100]):  # No weird characters at start
+            score += 0.1
+        # Readability (simple check)
+        avg_word_length = np.mean([len(word) for word in content.split()])
+        if 3 <= avg_word_length <= 8:
+            score += 0.2
+        return min(score, 1.0)
class DataProcessor:
    """Post-processing pipeline for scraped items.

    Offers text cleaning, quality filtering, a heuristic language guess and,
    when the optional transformers models load, sentiment analysis and named
    entity extraction.
    """

    def __init__(self):
        self.language_detector = None
        self.sentiment_analyzer = None
        self.ner_model = None
        self._load_models()

    def _load_models(self):
        """Load the optional NLP pipelines; leave them as None on failure."""
        if not HAS_TRANSFORMERS:
            return
        try:
            # Sentiment analysis
            self.sentiment_analyzer = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest"
            )
            # Named Entity Recognition
            self.ner_model = pipeline(
                "ner",
                model="dbmdz/bert-large-cased-finetuned-conll03-english",
                aggregation_strategy="simple"
            )
            logger.info("✅ NLP models loaded successfully")
        except Exception as e:
            # Best effort: the app keeps working without the models
            logger.warning(f"⚠️ Could not load NLP models: {e}")

    def process_items(self, items: List[ScrapedItem], processing_options: Dict[str, bool]) -> List[ScrapedItem]:
        """Process every item and return those that survive.

        Items rejected by the quality filter, or whose processing raised,
        are dropped from the result.
        """
        processed_items = []
        for item in items:
            processed_item = self._process_single_item(item, processing_options)
            if processed_item:
                processed_items.append(processed_item)
        return processed_items

    def _process_single_item(self, item: ScrapedItem, options: Dict[str, bool]) -> Optional[ScrapedItem]:
        """Apply the enabled processing steps to *item* in place.

        Recognised option keys (with defaults): ``clean_text`` (True),
        ``quality_filter`` (True), ``add_sentiment`` (False),
        ``extract_entities`` (False), ``detect_language`` (True).

        Returns:
            The same item, mutated, or None when it is filtered out
            (``quality_score`` below 0.3) or processing fails.
        """
        try:
            # Clean content
            if options.get('clean_text', True):
                item.content = self._clean_text_advanced(item.content)
            # Filter by quality
            if options.get('quality_filter', True) and item.quality_score < 0.3:
                return None
            # Add sentiment analysis
            if options.get('add_sentiment', False) and self.sentiment_analyzer:
                item.metadata['sentiment'] = self._analyze_sentiment(item.content)
            # Add named entities
            if options.get('extract_entities', False) and self.ner_model:
                item.metadata['entities'] = self._extract_entities(item.content)
            # Add language detection
            if options.get('detect_language', True):
                item.language = self._detect_language(item.content)
            return item
        except Exception as e:
            logger.error(f"Error processing item {item.id}: {e}")
            return None

    def _clean_text_advanced(self, text: str) -> str:
        """Remove URLs, e-mail addresses, repeated punctuation and short lines.

        Whitespace is normalised *per line* so that the short-paragraph
        filter below still sees the original line structure; collapsing all
        whitespace first would merge everything into a single paragraph and
        make the filter a no-op.
        """
        # Remove URLs
        text = re.sub(r'http\S+|www\.\S+', '', text)
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        # Collapse runs of !/? to the first character of the run, so "??"
        # stays a question instead of turning into "!"
        text = re.sub(r'([!?])[!?]+', r'\1', text)
        text = re.sub(r'\.{3,}', '...', text)
        # Normalize whitespace inside each paragraph, then drop very short
        # paragraphs (likely navigation)
        paragraphs = (re.sub(r'\s+', ' ', p).strip() for p in text.split('\n'))
        kept = [p for p in paragraphs if len(p) > 20]
        return '\n'.join(kept).strip()

    def _analyze_sentiment(self, text: str) -> Dict[str, Any]:
        """Return ``{'label', 'score'}`` for *text*, or an UNKNOWN placeholder.

        Only the first 512 characters are sent to the model.
        """
        try:
            # Truncate text for model limits
            text_sample = text[:512]
            result = self.sentiment_analyzer(text_sample)[0]
            return {
                'label': result['label'],
                # float(): keep the value JSON-serialisable even if the
                # pipeline hands back a NumPy scalar
                'score': float(result['score'])
            }
        except Exception as e:
            logger.warning(f"Sentiment analysis failed: {e}")
            return {'label': 'UNKNOWN', 'score': 0.0}

    def _extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Return named entities found in *text*, or ``[]`` on failure.

        Only the first 512 characters are sent to the model. Each entity is
        a dict with ``text``, ``label`` and ``confidence`` keys.
        """
        try:
            # Truncate text for model limits
            text_sample = text[:512]
            entities = self.ner_model(text_sample)
            return [
                {
                    'text': ent['word'],
                    'label': ent['entity_group'],
                    # float(): pipeline scores may be NumPy floats, which
                    # json.dump cannot serialise at export time
                    'confidence': float(ent['score'])
                }
                for ent in entities
            ]
        except Exception as e:
            logger.warning(f"Entity extraction failed: {e}")
            return []

    def _detect_language(self, text: str) -> str:
        """Guess the language from characteristic letters.

        Returns one of ``'ru'``, ``'fr'``, ``'es'`` or ``'en'`` (the
        default). This is a rough character-based heuristic, not a real
        language detector.
        """
        lowered = text.lower()
        if re.search(r'[а-яё]', lowered):
            return 'ru'
        # Letters from the French set that the Spanish set does not contain
        # must be tested first: 'é' and 'ü' occur in both, so checking the
        # Spanish set first labels almost all French text as Spanish.
        if re.search(r'[àâäçèêëïîôöùûÿ]', lowered):
            return 'fr'
        if re.search(r'[ñáéíóúü]', lowered):
            return 'es'
        return 'en'
class AnnotationEngine:
    """Holds the predefined dataset templates and prepares annotation payloads."""

    def __init__(self):
        self.templates = self._load_templates()

    def _load_templates(self) -> Dict[str, DatasetTemplate]:
        """Build the built-in templates, keyed by their identifier."""
        specs = [
            ('text_classification', dict(
                name="Text Classification",
                description="Classify text into predefined categories",
                task_type="classification",
                required_fields=["text", "label"],
                optional_fields=["confidence", "metadata"],
                example_format={"text": "Sample text", "label": "positive"},
                instructions="Label each text with the appropriate category",
            )),
            ('sentiment_analysis', dict(
                name="Sentiment Analysis",
                description="Analyze emotional tone of text",
                task_type="classification",
                required_fields=["text", "sentiment"],
                optional_fields=["confidence", "aspects"],
                example_format={"text": "I love this!", "sentiment": "positive"},
                instructions="Classify the sentiment as positive, negative, or neutral",
            )),
            ('named_entity_recognition', dict(
                name="Named Entity Recognition",
                description="Identify and classify named entities in text",
                task_type="ner",
                required_fields=["text", "entities"],
                optional_fields=["metadata"],
                example_format={
                    "text": "John works at OpenAI in San Francisco",
                    "entities": [
                        {"text": "John", "label": "PERSON", "start": 0, "end": 4},
                        {"text": "OpenAI", "label": "ORG", "start": 14, "end": 20},
                    ],
                },
                instructions="Mark all named entities (people, organizations, locations, etc.)",
            )),
            ('question_answering', dict(
                name="Question Answering",
                description="Create question-answer pairs from text",
                task_type="qa",
                required_fields=["context", "question", "answer"],
                optional_fields=["answer_start", "metadata"],
                example_format={
                    "context": "The capital of France is Paris.",
                    "question": "What is the capital of France?",
                    "answer": "Paris",
                },
                instructions="Create meaningful questions and provide accurate answers",
            )),
            ('summarization', dict(
                name="Text Summarization",
                description="Create concise summaries of longer texts",
                task_type="summarization",
                required_fields=["text", "summary"],
                optional_fields=["summary_type", "length"],
                example_format={
                    "text": "Long article text...",
                    "summary": "Brief summary of the main points",
                },
                instructions="Write clear, concise summaries capturing key information",
            )),
        ]
        return {key: DatasetTemplate(**fields) for key, fields in specs}

    def create_annotation_interface(self, template_name: str, items: List[ScrapedItem]) -> Dict[str, Any]:
        """Assemble the payload an annotation UI needs for *template_name*.

        Returns a dict with the resolved ``template``, one ``data`` record
        per item (content truncated to 1000 characters, empty
        ``annotations``), and ``progress`` / ``completed`` counters at 0.

        Raises:
            ValueError: if *template_name* is not a known template.
        """
        chosen = self.templates.get(template_name)
        if not chosen:
            raise ValueError(f"Unknown template: {template_name}")

        # One record per scraped item; text is shortened for display
        records = [
            {
                'id': entry.id,
                'text': entry.content[:1000],  # Truncate for UI
                'title': entry.title,
                'url': entry.url,
                'annotations': {},
            }
            for entry in items
        ]
        return {
            'template': chosen,
            'data': records,
            'progress': 0,
            'completed': 0,
        }
class DatasetExporter:
    """Write prepared datasets to disk in several formats.

    Implemented formats: ``huggingface_datasets``, ``json``, ``csv``,
    ``jsonl`` and ``parquet``. ``pytorch`` and ``tensorflow`` are listed in
    ``supported_formats`` but have no exporter and are rejected by
    ``export_dataset``.
    """

    def __init__(self):
        self.supported_formats = [
            'huggingface_datasets',
            'json',
            'csv',
            'parquet',
            'jsonl',
            'pytorch',
            'tensorflow'
        ]

    def export_dataset(self, items: List[ScrapedItem], template: DatasetTemplate,
                      export_format: str, annotations: Dict[str, Any] = None) -> str:
        """Export *items*, shaped by *template*, in *export_format*.

        Args:
            items: Scraped items to export.
            template: Template whose required/optional fields define the
                columns of each exported record.
            export_format: One of the implemented format identifiers.
            annotations: Optional mapping of item id to a dict of extra
                fields merged into that item's record.

        Returns:
            The path of the file (or dataset directory) that was written.

        Raises:
            ValueError: if *export_format* has no exporter.
        """
        # Exporters that take only the prepared records
        simple_exporters = {
            'json': self._export_json,
            'csv': self._export_csv,
            'jsonl': self._export_jsonl,
            'parquet': self._export_parquet,
        }
        try:
            # Prepare dataset
            dataset_data = self._prepare_dataset_data(items, template, annotations)
            # Export based on format
            if export_format == 'huggingface_datasets':
                return self._export_huggingface(dataset_data, template)
            exporter = simple_exporters.get(export_format)
            if exporter is None:
                raise ValueError(f"Unsupported format: {export_format}")
            return exporter(dataset_data)
        except Exception as e:
            logger.error(f"Export failed: {e}")
            raise

    def _prepare_dataset_data(self, items: List[ScrapedItem], template: DatasetTemplate,
                            annotations: Dict[str, Any] = None) -> List[Dict[str, Any]]:
        """Turn items (plus optional annotations) into template-shaped records.

        Items that lack any of the template's required fields are skipped.
        """
        dataset_data = []
        for item in items:
            # Base data from scraped item
            data_point = {
                'text': item.content,
                'title': item.title,
                'url': item.url,
                'metadata': item.metadata
            }
            # Add annotations if available; they may override base fields
            if annotations and item.id in annotations:
                data_point.update(annotations[item.id])
            # Format according to template
            formatted_point = self._format_for_template(data_point, template)
            if formatted_point:
                dataset_data.append(formatted_point)
        return dataset_data

    def _format_for_template(self, data_point: Dict[str, Any], template: DatasetTemplate) -> Optional[Dict[str, Any]]:
        """Project *data_point* onto the template's fields.

        Returns:
            A dict with every required field plus any optional fields that
            are present, or None when a required field is missing.
        """
        formatted = {}
        # Ensure required fields are present
        for field in template.required_fields:
            if field in data_point:
                formatted[field] = data_point[field]
            elif field == 'text' and 'content' in data_point:
                # Accept 'content' as an alias for the 'text' field
                formatted[field] = data_point['content']
            else:
                # Skip this data point if required field is missing
                return None
        # Add optional fields if present
        for field in template.optional_fields:
            if field in data_point:
                formatted[field] = data_point[field]
        return formatted

    def _export_huggingface(self, dataset_data: List[Dict[str, Any]], template: DatasetTemplate) -> str:
        """Save the records as a HuggingFace dataset directory with a README.

        Raises:
            ImportError: if the ``datasets`` library is not available.
        """
        if not HAS_DATASETS:
            raise ImportError("datasets library not available")
        try:
            # Create dataset
            dataset = Dataset.from_list(dataset_data)
            # Create dataset card
            card_content = f"""
# {template.name} Dataset
## Description
{template.description}
## Task Type
{template.task_type}
## Format
{template.example_format}
## Instructions
{template.instructions}
## Statistics
- Total samples: {len(dataset_data)}
- Created: {datetime.now().isoformat()}
## Usage
```python
from datasets import load_dataset
dataset = load_dataset('path/to/dataset')
```
"""
            # Save dataset
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            dataset_name = f"{template.name.lower().replace(' ', '_')}_{timestamp}"
            # Save locally (would push to Hub in production)
            dataset.save_to_disk(dataset_name)
            # Create info file. Explicit UTF-8 so the card is written the
            # same way regardless of the platform's default encoding.
            with open(f"{dataset_name}/README.md", "w", encoding="utf-8") as f:
                f.write(card_content)
            return dataset_name
        except Exception as e:
            logger.error(f"HuggingFace export failed: {e}")
            raise

    def _export_json(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Write the records as one pretty-printed JSON array; return the path."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(dataset_data, f, indent=2, ensure_ascii=False)
        return filename

    def _export_csv(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Write the records as a CSV file via pandas; return the path."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.csv"
        df = pd.DataFrame(dataset_data)
        df.to_csv(filename, index=False)
        return filename

    def _export_jsonl(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Write one JSON object per line; return the path."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.jsonl"
        with open(filename, 'w', encoding='utf-8') as f:
            for item in dataset_data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        return filename

    def _export_parquet(self, dataset_data: List[Dict[str, Any]]) -> str:
        """Write the records as a Parquet file via pandas; return the path.

        Nested values (dicts/lists such as ``metadata`` or ``entities``) are
        stored as JSON strings so that records with differing or empty
        nested structures still share one flat column schema.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"dataset_{timestamp}.parquet"
        rows = [
            {
                key: json.dumps(value, ensure_ascii=False, default=str)
                if isinstance(value, (dict, list)) else value
                for key, value in record.items()
            }
            for record in dataset_data
        ]
        pd.DataFrame(rows).to_parquet(filename, index=False)
        return filename
+def create_modern_interface():
+    """Create modern, intuitive interface for AI Dataset Studio"""
+    # Initialize the studio
+    studio = DatasetStudio()
+    # Custom CSS for modern appearance
     custom_css = """
     .gradio-container {
+        max-width: 1400px;
         margin: auto;
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
     }
+    .studio-header {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
         color: white;
         padding: 2rem;
+        border-radius: 15px;
         margin-bottom: 2rem;
+        text-align: center;
+        box-shadow: 0 8px 32px rgba(0,0,0,0.1);
     }
+    .workflow-card {
+        background: #f8f9ff;
+        border: 2px solid #e1e5ff;
+        border-radius: 12px;
         padding: 1.5rem;
         margin: 1rem 0;
+        transition: all 0.3s ease;
     }
+    .workflow-card:hover {
+        border-color: #667eea;
+        box-shadow: 0 4px 20px rgba(102, 126, 234, 0.1);
     }
+    .step-header {
+        display: flex;
+        align-items: center;
+        margin-bottom: 1rem;
+        font-size: 1.2em;
+        font-weight: 600;
+        color: #4c51bf;
+    }
+    .step-number {
+        background: #667eea;
+        color: white;
+        border-radius: 50%;
+        width: 30px;
+        height: 30px;
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        margin-right: 1rem;
         font-weight: bold;
     }
+    .feature-grid {
+        display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+        gap: 1rem;
+        margin: 1rem 0;
+    }
+    .feature-item {
+        background: white;
+        border: 1px solid #e2e8f0;
+        border-radius: 8px;
+        padding: 1rem;
+        text-align: center;
+    }
+    .stat-card {
+        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+        color: white;
+        padding: 1rem;
+        border-radius: 10px;
+        text-align: center;
+        margin: 0.5rem;
+    }
+    .progress-bar {
+        background: #e2e8f0;
+        border-radius: 10px;
+        height: 8px;
+        overflow: hidden;
+    }
+    .progress-fill {
+        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
+        height: 100%;
+        transition: width 0.3s ease;
+    }
+    .template-card {
+        border: 2px solid #e2e8f0;
+        border-radius: 10px;
+        padding: 1rem;
+        margin: 0.5rem;
+        cursor: pointer;
+        transition: all 0.3s ease;
+    }
+    .template-card:hover {
+        border-color: #667eea;
+        transform: translateY(-2px);
+        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
+    }
+    .template-selected {
+        border-color: #667eea;
+        background: #f7fafc;
+    }
+    .export-option {
+        background: #f7fafc;
+        border: 1px solid #e2e8f0;
+        border-radius: 8px;
+        padding: 1rem;
+        margin: 0.5rem 0;
+        cursor: pointer;
+    }
+    .export-option:hover {
+        background: #edf2f7;
+        border-color: #cbd5e0;
+    }
+    .success-message {
+        background: #f0fff4;
+        border: 1px solid #9ae6b4;
+        color: #276749;
+        padding: 1rem;
+        border-radius: 8px;
+        margin: 1rem 0;
+    }
+    .error-message {
+        background: #fed7d7;
+        border: 1px solid #feb2b2;
+        color: #c53030;
+        padding: 1rem;
+        border-radius: 8px;
+        margin: 1rem 0;
+    }
     """
+    # Project state for UI
+    project_state = gr.State({})
+    with gr.Blocks(css=custom_css, title="AI Dataset Studio", theme=gr.themes.Soft()) as interface:
         # Header
         gr.HTML("""
+        <div class="studio-header">
+            <h1>🚀 AI Dataset Studio</h1>
+            <p>Create high-quality training datasets without coding - Your personal Scale AI</p>
+            <p style="opacity: 0.9; font-size: 0.9em;">Web Scraping → Data Processing → Annotation → ML-Ready Datasets</p>
         </div>
         """)
+        # Main workflow tabs
+        with gr.Tabs() as main_tabs:
+            # Tab 1: Project Setup
+            with gr.Tab("🎯 Project Setup", id="setup"):
+                gr.HTML('<div class="step-header"><div class="step-number">1</div>Start Your Dataset Project</div>')
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>📋 Project Configuration</h3>
+                            <p>Define your dataset project and choose the type of AI task you're building for.</p>
+                        </div>
+                        """)
+                        project_name = gr.Textbox(
+                            label="Project Name",
+                            placeholder="e.g., 'News Sentiment Analysis' or 'Product Review Classification'",
+                            value="My Dataset Project"
+                        )
+                        # Template selection with visual cards
+                        gr.HTML("<h4>🎨 Choose Your Dataset Template</h4>")
+                        template_choice = gr.Radio(
+                            choices=[
+                                ("📊 Text Classification", "text_classification"),
+                                ("😊 Sentiment Analysis", "sentiment_analysis"),
+                                ("👥 Named Entity Recognition", "named_entity_recognition"),
+                                ("❓ Question Answering", "question_answering"),
+                                ("📝 Text Summarization", "summarization")
+                            ],
+                            label="Dataset Type",
+                            value="text_classification",
+                            interactive=True
+                        )
+                        create_project_btn = gr.Button(
+                            "🚀 Create Project",
+                            variant="primary",
+                            size="lg"
+                        )
+                        project_status = gr.Markdown("")
+                    with gr.Column(scale=1):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>💡 Template Guide</h3>
+                            <div class="feature-grid">
+                                <div class="feature-item">
+                                    <h4>📊 Text Classification</h4>
+                                    <p>Categorize text into predefined labels</p>
+                                    <small>Great for: Spam detection, topic classification</small>
+                                </div>
+                                <div class="feature-item">
+                                    <h4>😊 Sentiment Analysis</h4>
+                                    <p>Analyze emotional tone and opinions</p>
+                                    <small>Great for: Review analysis, social media monitoring</small>
+                                </div>
+                                <div class="feature-item">
+                                    <h4>👥 Named Entity Recognition</h4>
+                                    <p>Identify people, places, organizations</p>
+                                    <small>Great for: Information extraction, content tagging</small>
+                                </div>
+                            </div>
+                        </div>
+                        """)
+            # Tab 2: Data Collection
+            with gr.Tab("🕷️ Data Collection", id="collection"):
+                gr.HTML('<div class="step-header"><div class="step-number">2</div>Collect Your Data</div>')
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>🌐 Web Scraping</h3>
+                            <p>Provide URLs to scrape content automatically. Our AI will extract clean, structured text.</p>
+                        </div>
+                        """)
+                        # URL input methods
+                        with gr.Tabs():
+                            with gr.Tab("📝 Manual Input"):
+                                urls_input = gr.Textbox(
+                                    label="URLs to Scrape",
+                                    placeholder="https://example.com/article1\nhttps://example.com/article2\n...",
+                                    lines=8,
+                                    info="Enter one URL per line"
+                                )
+                            with gr.Tab("📎 File Upload"):
+                                urls_file = gr.File(
+                                    label="Upload URL List",
+                                    file_types=[".txt", ".csv"],
+                                    info="Upload a text file with URLs (one per line) or CSV with 'url' column"
+                                )
+                        scrape_btn = gr.Button("🚀 Start Scraping", variant="primary", size="lg")
+                        # Progress tracking
+                        scraping_progress = gr.Progress()
+                        scraping_status = gr.Markdown("")
+                    with gr.Column(scale=1):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>⚡ Features</h3>
+                            <ul style="list-style: none; padding: 0;">
+                                <li>✅ Smart content extraction</li>
+                                <li>✅ Quality scoring</li>
+                                <li>✅ Duplicate detection</li>
+                                <li>✅ Security validation</li>
+                                <li>✅ Metadata extraction</li>
+                                <li>✅ Rate limiting</li>
+                            </ul>
+                        </div>
+                        """)
+                        # Quick stats
+                        collection_stats = gr.HTML("")
+            # Tab 3: Data Processing
+            with gr.Tab("⚙️ Data Processing", id="processing"):
+                gr.HTML('<div class="step-header"><div class="step-number">3</div>Clean & Enhance Your Data</div>')
                 with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>🔧 Processing Options</h3>
+                            <p>Configure how to clean and enhance your scraped data with AI-powered analysis.</p>
+                        </div>
+                        """)
+                        # Processing options
+                        with gr.Row():
+                            with gr.Column():
+                                clean_text = gr.Checkbox(label="🧹 Advanced Text Cleaning", value=True)
+                                quality_filter = gr.Checkbox(label="🎯 Quality Filtering", value=True)
+                                detect_language = gr.Checkbox(label="🌍 Language Detection", value=True)
+                            with gr.Column():
+                                add_sentiment = gr.Checkbox(label="😊 Sentiment Analysis", value=False)
+                                extract_entities = gr.Checkbox(label="👥 Entity Extraction", value=False)
+                                deduplicate = gr.Checkbox(label="🔄 Remove Duplicates", value=True)
+                        process_btn = gr.Button("⚙️ Process Data", variant="primary", size="lg")
+                        processing_status = gr.Markdown("")
+                    with gr.Column(scale=1):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>📊 Processing Stats</h3>
+                            <div id="processing-stats"></div>
+                        </div>
+                        """)
+                        processing_stats = gr.HTML("")
+            # Tab 4: Data Preview
+            with gr.Tab("👀 Data Preview", id="preview"):
+                gr.HTML('<div class="step-header"><div class="step-number">4</div>Review Your Dataset</div>')
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>📋 Dataset Preview</h3>
+                            <p>Review your processed data before annotation or export.</p>
+                        </div>
+                        """)
+                        refresh_preview_btn = gr.Button("🔄 Refresh Preview", variant="secondary")
+                        # Data preview table
+                        data_preview = gr.DataFrame(
+                            headers=["Title", "Content Preview", "Word Count", "Quality Score", "URL"],
+                            label="Dataset Preview",
+                            interactive=False
+                        )
+                    with gr.Column(scale=1):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>📈 Dataset Statistics</h3>
+                        </div>
+                        """)
+                        dataset_stats = gr.JSON(label="Statistics")
+            # Tab 5: Export
+            # Layout: a wide column with the export controls and results next to a
+            # narrow column of static help text describing the formats.
+            with gr.Tab("📤 Export Dataset", id="export"):
+                gr.HTML('<div class="step-header"><div class="step-number">5</div>Export Your Dataset</div>')
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>💾 Export Options</h3>
+                            <p>Export your dataset in various formats for different ML frameworks and platforms.</p>
+                        </div>
+                        """)
+                        # Export format selection
+                        # Each choice is a (display label, value) pair; the value string
+                        # is what export_dataset_handler receives. Defaults to "json".
+                        export_format = gr.Radio(
+                            choices=[
+                                ("🤗 HuggingFace Datasets", "huggingface_datasets"),
+                                ("📄 JSON", "json"),
+                                ("📊 CSV", "csv"),
+                                ("📋 JSONL", "jsonl"),
+                                ("⚡ Parquet", "parquet")
+                            ],
+                            label="Export Format",
+                            value="json"
+                        )
+                        # Template for export
+                        # Passed to studio.export_dataset as the template name;
+                        # defaults to "text_classification".
+                        export_template = gr.Dropdown(
+                            choices=[
+                                "text_classification",
+                                "sentiment_analysis",
+                                "named_entity_recognition",
+                                "question_answering",
+                                "summarization"
+                            ],
+                            label="Dataset Template",
+                            value="text_classification"
+                        )
+                        export_btn = gr.Button("📤 Export Dataset", variant="primary", size="lg")
+                        # Export results
+                        # export_status shows the Markdown message returned by the export
+                        # handler; export_file is the download slot and is created hidden.
+                        export_status = gr.Markdown("")
+                        export_file = gr.File(label="Download Dataset", visible=False)
+                    with gr.Column(scale=1):
+                        # Static help card; not connected to any event handler here
+                        gr.HTML("""
+                        <div class="workflow-card">
+                            <h3>📋 Export Formats</h3>
+                            <div class="feature-item">
+                                <h4>🤗 HuggingFace</h4>
+                                <p>Ready for transformers library</p>
+                            </div>
+                            <div class="feature-item">
+                                <h4>📄 JSON/JSONL</h4>
+                                <p>Universal format for any framework</p>
+                            </div>
+                            <div class="feature-item">
+                                <h4>📊 CSV</h4>
+                                <p>Easy analysis in Excel/Pandas</p>
+                            </div>
+                        </div>
+                        """)
+        # Event handlers
+        def create_project(name, template):
+            """Create new project"""
+            if not name.strip():
+                return "❌ Please enter a project name", {}
+            project = studio.start_new_project(name.strip(), template)
+            status = f"""
+            ✅ **Project Created Successfully!**
+            **Project:** {project['name']}
+            **Type:** {template.replace('_', ' ').title()}
+            **ID:** {project['id'][:8]}...
+            **Created:** {project['created_at'][:19]}
+            👉 **Next Step:** Go to the Data Collection tab to start scraping URLs
+            """
+            return status, project
+        def scrape_urls_handler(urls_text, urls_file, project, progress=gr.Progress()):
+            """Handle URL scraping"""
+            if not project:
+                return "❌ Please create a project first", ""
+            # Process URLs from text input or file
+            urls = []
+            if urls_text:
+                urls = [url.strip() for url in urls_text.split('\n') if url.strip()]
+            elif urls_file:
+                # Handle file upload (simplified)
+                try:
+                    content = urls_file.read().decode('utf-8')
+                    urls = [url.strip() for url in content.split('\n') if url.strip()]
+                except:
+                    return "❌ Error reading uploaded file", ""
+            if not urls:
+                return "❌ No URLs provided", ""
+            # Progress callback
+            def progress_callback(pct, msg):
+                progress(pct, desc=msg)
+            # Scrape URLs
+            success_count, errors = studio.scrape_urls(urls, progress_callback)
+            if success_count > 0:
+                stats_html = f"""
+                <div class="stat-card">
+                    <h3>✅ Scraping Complete</h3>
+                    <p><strong>{success_count}</strong> items collected</p>
+                    <p><strong>{len(urls) - success_count}</strong> failed</p>
+                </div>
+                """
+                status = f"""
+                ✅ **Scraping Complete!**
+                **Successfully scraped:** {success_count} URLs
+                **Failed:** {len(urls) - success_count} URLs
+                👉 **Next Step:** Go to Data Processing tab to clean and enhance your data
+                """
+                return status, stats_html
+            else:
+                return f"❌ Scraping failed: {', '.join(errors)}", ""
+        def process_data_handler(clean_text, quality_filter, detect_language,
+                               add_sentiment, extract_entities, deduplicate, project):
+            """Handle data processing"""
+            if not project:
+                return "❌ Please create a project first", ""
+            if not studio.scraped_items:
+                return "❌ No scraped data to process. Please scrape URLs first.", ""
+            # Configure processing options
+            options = {
+                'clean_text': clean_text,
+                'quality_filter': quality_filter,
+                'detect_language': detect_language,
+                'add_sentiment': add_sentiment,
+                'extract_entities': extract_entities,
+                'deduplicate': deduplicate
+            }
+            # Process data
+            processed_count = studio.process_data(options)
+            if processed_count > 0:
+                stats = studio.get_data_statistics()
+                stats_html = f"""
+                <div class="stat-card">
+                    <h3>⚙️ Processing Complete</h3>
+                    <p><strong>{processed_count}</strong> items processed</p>
+                    <p>Avg Quality: <strong>{stats.get('avg_quality_score', 0)}</strong></p>
+                    <p>Avg Words: <strong>{stats.get('avg_word_count', 0)}</strong></p>
+                </div>
+                """
+                status = f"""
+                ✅ **Processing Complete!**
+                **Processed items:** {processed_count}
+                **Average quality score:** {stats.get('avg_quality_score', 0)}
+                **Average word count:** {stats.get('avg_word_count', 0)}
+                👉 **Next Step:** Check the Data Preview tab to review your dataset
+                """
+                return status, stats_html
+            else:
+                return "❌ No items passed processing filters", ""
+        def refresh_preview_handler(project):
+            """Refresh data preview"""
+            if not project:
+                return None, {}
+            preview_data = studio.get_data_preview()
+            stats = studio.get_data_statistics()
+            if preview_data:
+                # Convert to DataFrame format
+                df_data = []
+                for item in preview_data:
+                    df_data.append([
+                        item['title'][:50] + "..." if len(item['title']) > 50 else item['title'],
+                        item['content_preview'],
+                        item['word_count'],
+                        item['quality_score'],
+                        item['url'][:50] + "..." if len(item['url']) > 50 else item['url']
+                    ])
+                return df_data, stats
+            return None, {}
+        def export_dataset_handler(export_format, export_template, project):
+            """Handle dataset export"""
+            if not project:
+                return "❌ Please create a project first", None
+            if not studio.processed_items and not studio.scraped_items:
+                return "❌ No data to export. Please scrape and process data first.", None
+            try:
+                # Export dataset
+                filename = studio.export_dataset(export_template, export_format)
+                status = f"""
+                ✅ **Export Successful!**
+                **Format:** {export_format}
+                **Template:** {export_template.replace('_', ' ').title()}
+                **File:** {filename}
+                📥 **Download your dataset using the link below**
+                """
+                return status, filename
+            except Exception as e:
+                return f"❌ Export failed: {str(e)}", None
+        # Connect event handlers
+        # Each handler returns one value per component listed in `outputs`, in order.
+        # Creating a project stores it in project_state, which the other handlers
+        # receive as their `project` argument and check before doing any work.
+        create_project_btn.click(
+            fn=create_project,
+            inputs=[project_name, template_choice],
+            outputs=[project_status, project_state]
+        )
+        # Scrape: URLs come from the text box and/or the uploaded file
         scrape_btn.click(
+            fn=scrape_urls_handler,
+            inputs=[urls_input, urls_file, project_state],
+            outputs=[scraping_status, collection_stats]
+        )
+        # Process: the six option components map positionally onto the handler's
+        # first six parameters, followed by the project state
+        process_btn.click(
+            fn=process_data_handler,
+            inputs=[clean_text, quality_filter, detect_language,
+                   add_sentiment, extract_entities, deduplicate, project_state],
+            outputs=[processing_status, processing_stats]
+        )
+        # Manual preview refresh from the Data Preview tab
+        refresh_preview_btn.click(
+            fn=refresh_preview_handler,
+            inputs=[project_state],
+            outputs=[data_preview, dataset_stats]
         )
+        # Export: status message plus the downloadable file component
         export_btn.click(
+            fn=export_dataset_handler,
+            inputs=[export_format, export_template, project_state],
+            outputs=[export_status, export_file]
         )
+        # Auto-refresh preview when processing completes
+        # Triggered by any change of the processing status text, so it also runs
+        # when processing reports an error message.
+        processing_status.change(
+            fn=refresh_preview_handler,
+            inputs=[project_state],
+            outputs=[data_preview, dataset_stats]
         )
     return interface
 # Launch the application
 if __name__ == "__main__":
+    logger.info("🚀 Starting AI Dataset Studio...")
+    # Check available features
+    features = []
+    if HAS_TRANSFORMERS:
+        features.append("✅ AI Models")
+    else:
+        features.append("⚠️ Basic Processing")
+    if HAS_NLTK:
+        features.append("✅ Advanced NLP")
+    else:
+        features.append("⚠️ Basic NLP")
+    if HAS_DATASETS:
+        features.append("✅ HuggingFace Integration")
+    else:
+        features.append("⚠️ Standard Export Only")
+    logger.info(f"📊 Features: {' | '.join(features)}")
+    try:
+        interface = create_modern_interface()
+        logger.info("✅ Interface created successfully")
+        interface.launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False,
+            show_error=True,
+            debug=False
+        )
+    except Exception as e:
+        logger.error(f"❌ Failed to launch application: {e}")
+        raise