Spaces:

acecalisto3
/

urld

Running

App Files Community

acecalisto3 committed on Dec 6, 2024

Commit

c1f083f

verified ·

1 Parent(s): 83b1e97

Update app.py

Browse files

Files changed (1) hide show

app.py +306 -358

app.py CHANGED Viewed

@@ -1,11 +1,7 @@
 import json
 import os
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-from sklearn.metrics import accuracy_score
-from torch.utils.data import DataLoader
-from transformers import Trainer, TrainingArguments
-import time
 import requests
 from bs4 import BeautifulSoup
 import tempfile
@@ -14,401 +10,353 @@ import mimetypes
 from tqdm import tqdm
 import logging
 import gradio as gr
-from typing import List, Dict
-# Setup logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# --- URL and File Processing Functions ---
-def fetch_content(url, retries=3):
-    for attempt in range(retries):
         try:
-            response = requests.get(url, timeout=10)
-            response.raise_for_status()
-            logger.info(f"Successfully fetched content from {url}")
-            return response.text
-        except requests.RequestException as e:
-            logger.error(f"Error fetching {url} (attempt {attempt + 1}/{retries}): {e}")
-            if attempt == retries - 1:
-                return None
-def extract_text(html):
     if not html:
-        logger.warning("Empty HTML content provided for extraction.")
         return ""
     soup = BeautifulSoup(html, 'html.parser')
-    for script in soup(["script", "style"]):
-        script.decompose()
-    text = soup.get_text()
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    extracted_text = '\n'.join(chunk for chunk in chunks if chunk)
-    logger.info("Text extraction completed.")
-    return extracted_text
-def process_urls(urls):
-    dataset = []
-    for url in tqdm(urls, desc="Fetching URLs"):
-        if not url.startswith("http://") and not url.startswith("https://"):
-            logger.warning(f"Invalid URL format: {url}")
-            continue
-        html = fetch_content(url)
-        if html:
-            text = extract_text(html)
-            if text:
-                dataset.append({"source": "url", "url": url, "content": text})
-            else:
-                logger.warning(f"No text extracted from {url}")
-        else:
-            logger.error(f"Failed to fetch content from {url}")
-        time.sleep(1)
-    return dataset
 def preprocess_bulk_text(text: str) -> str:
-    """
-    Preprocess bulk text input by adding commas between logical separations.
-    Handles line breaks, slashes, and domain endings.
-    """
-    # First, normalize line endings
     text = text.replace('\r\n', '\n').replace('\r', '\n')
-    # Split by common separators
-    separators = [
-        '\n',      # Line breaks
-        ' / ',     # Forward slashes with spaces
-        '/',       # Forward slashes
-        ';',       # Semicolons
-        ' - ',     # Dashes with spaces
-        '|',       # Vertical bars
-        '  '       # Double spaces
-    ]
     # Replace separators with commas if not already comma-separated
     if ',' not in text:
         for separator in separators:
             text = text.replace(separator, ',')
-        # Handle domain endings (e.g., .com .org .net)
-        import re
         domain_pattern = r'(\.[a-z]{2,})\s+'
         text = re.sub(domain_pattern, r'\1,', text)
-        # Clean up multiple commas
         text = re.sub(r',+', ',', text)
-        # Remove leading/trailing commas and whitespace
         text = text.strip(',' + string.whitespace)
-        # Ensure proper spacing around commas
         text = re.sub(r'\s*,\s*', ', ', text)
     return text
-def process_inputs(urls, file, text_input, model_name, batch_size, epochs):
-    try:
-        # Log the input parameters for debugging
-        logger.info("Processing inputs with the following parameters:")
-        logger.info(f"URLs: {urls}")
-        logger.info(f"File: {file}")
-        logger.info(f"Text Input: {text_input}")
-        logger.info(f"Model Name: {model_name}")
-        logger.info(f"Batch Size: {batch_size}")
-        logger.info(f"Epochs: {epochs}")
-        # Validate inputs
-        if not urls and not file and not text_input:
-            logger.error("No input data provided. Please provide at least one of URLs, file, or text input.")
-            return "Error: No input data provided."
-        # Create dataset or perform any processing logic you need
-        output_file = create_dataset(urls, file, text_input, model_name, batch_size, epochs)
-        # Log the successful creation of the dataset
-        logger.info(f"Dataset created successfully: {output_file}")
-        return output_file  # Return the output file for download
-    except Exception as e:
-        logger.error(f"An error occurred while processing inputs: {e}")
-        return f"Error: {str(e)}"  # Return error message for user feedback
-# Assuming process_btn is a Gradio button
-process_btn.click(
-    fn=process_inputs,
-    inputs=[
-        urls_input,
-        file_input,
-        text_input,
-        model_name,
-        batch_size,
-        epochs
-    ],
-    outputs=download_output
-)
-def process_file(file):
-    dataset = []
-    with tempfile.TemporaryDirectory() as temp_dir:
-        if zipfile.is_zipfile(file.name):
-            with zipfile.ZipFile(file.name, 'r') as zip_ref:
-                zip_ref.extractall(temp_dir)
-            for root, _, files in os.walk(temp_dir):
-                for filename in files:
-                    filepath = os.path.join(root, filename)
-                    mime_type, _ = mimetypes.guess_type(filepath)
-                    if mime_type and mime_type.startswith('text'):
-                        with open(filepath, 'r', errors='ignore') as f:
-                            content = f.read()
-                        if content.strip():
-                            dataset.append({"source": "file", "filename": filename, "content": content})
-                        else:
-                            logger.warning(f"File {filename} is empty.")
-                    else:
-                        logger.warning(f"File {filename} is not a text file.")
-                        dataset.append({"source": "file", "filename": filename, "content": "Binary file - content not extracted"})
-        else:
-            mime_type, _ = mimetypes.guess_type(file.name)
-            if mime_type and mime_type.startswith('text'):
-                content = file.read().decode('utf-8', errors='ignore')
-                if content.strip():
-                    dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": content})
-                else:
-                    logger.warning(f"Uploaded file {file.name} is empty.")
-            else:
-                logger.warning(f"Uploaded file {file.name} is not a text file.")
-                dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": "Binary file - content not extracted"})
-    return dataset
-def create_dataset(urls, file, text_input):
-    dataset = []
-    if urls:
-        dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
-    if file:
-        dataset.extend(process_file(file))
-    if text_input:
-        dataset.append({"source": "input", "content": text_input})
-    logger.info(f"Dataset created with {len(dataset)} entries.")
-    output_file = 'combined_dataset.json'
-    with open(output_file, 'w') as f:
-        json.dump(dataset, f, indent=2)
-    return output_file
-# --- Model Training and Evaluation Functions ---
-class CustomDataset(torch.utils.data.Dataset):
-    def __init__(self, data, tokenizer, max_length=512):
-        self.data = data
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-    def __len__(self):
-        return len(self.data)
-    def __getitem__(self, idx):
-        try:
-            text = self.data[idx]['content']  # Fixed the key to 'content'
-            label = self.data[idx].get('label', 0)
-            encoding = self.tokenizer.encode_plus(
-                text,
-                max_length=self.max_length,
-                padding='max_length',
-                truncation=True,
-                return_attention_mask=True,
-                return_tensors='pt',
-            )
-            return {
-                'input_ids': encoding['input_ids'].squeeze(),
-                'attention_mask': encoding['attention_mask'].squeeze(),
-                'labels': torch.tensor(label, dtype=torch.long)
-            }
-        except Exception as e:
-            logger.error(f"Error in processing item {idx}: {e}")
-            raise
-def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_length=2048):
-    try:
-        model = AutoModelForSequenceClassification.from_pretrained(model_name)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        model.to(device)
-        dataset = CustomDataset(data, tokenizer, max_length=max_length)
-        if len(dataset) == 0:
-            logger.error("The dataset is empty. Please check the input data.")
-            return None, None
-        train_size = int(0.8 * len(dataset))
-        val_size = len(dataset) - train_size
-        train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
-        training_args = TrainingArguments(
-            output_dir='./results',
-            num_train_epochs=epochs,
-            per_device_train_batch_size=batch_size,
-            per_device_eval_batch_size=batch_size,
-            eval_strategy='epoch',
-            save_strategy='epoch',
-            learning_rate=learning_rate,
-            save_steps=500,
-            load_best_model_at_end=True,
-            metric_for_best_model='accuracy',
-            greater_is_better=True,
-            save_total_limit=2,
-            seed=42,
-            dataloader_num_workers=4,
-            fp16=torch.cuda.is_available()
-        )
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=train_dataset,
-            eval_dataset=val_dataset,
-            compute_metrics=lambda pred: {
-                'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))
-            }
-        )
-        logger.info("Starting model training...")
-        start_time = time.time()
-        trainer.train()
-        end_time = time.time()
-        logger.info(f'Training time: {end_time - start_time:.2f} seconds')
-        logger.info("Evaluating model...")
-        eval_result = trainer.evaluate()
-        logger.info(f'Evaluation result: {eval_result}')
-        trainer.save_model('./model')
-        return model, tokenizer
-    except Exception as e:
-        logger.error(f"Error during training: {e}")
-        raise
-def deploy_model(model, tokenizer):
-    try:
-        model.save_pretrained('./model')
-        tokenizer.save_pretrained('./model')
-        deployment_script = f'''
-        import torch
-        from transformers import AutoModelForSequenceClassification, AutoTokenizer
-        model = AutoModelForSequenceClassification.from_pretrained('./model')
-        tokenizer = AutoTokenizer.from_pretrained('./model')
-        def predict(text):
-            encoding = tokenizer.encode_plus(
-                text,
-                max_length=512,
-                padding='max_length',
-                truncation=True,
-                return_attention_mask=True,
-                return_tensors='pt',
             )
-            input_ids = encoding['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
-            attention_mask = encoding['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
-            outputs = model(input_ids, attention_mask=attention_mask)
-            logits = outputs.logits
-            return torch.argmax(logits, dim=1).cpu().numpy()[0]
-        '''
-        with open('./deployment.py', 'w') as f:
-            f.write(deployment_script)
-        logger.info('Model deployed successfully. To use the model, run: python deployment.py')
-    except Exception as e:
-        logger.error(f"Error deploying model: {e}")
-        raise
-def create_interface():
-    """Create and return the Gradio interface"""
-    with gr.Blocks(title="Dataset Creation and Model Training") as interface:
-        gr.Markdown("# Dataset Creation and Model Training")
-        gr.Markdown("Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.")
         with gr.Row():
-            with gr.Column():
-                # URL input with auto-separation
-                urls_input = gr.Textbox(
-                    lines=5,
-                    label="Enter URLs",
-                    placeholder="Enter URLs separated by line breaks, commas, or slashes"
-                )
-                # File upload
-                file_input = gr.File(
-                    label="Upload file (including zip files)",
-                    type="filepath"
-                )
-                # Large text input
-                text_input = gr.Textbox(
-                    lines=10,
-                    label="Enter or paste large text",
-                    placeholder="Your text here..."
-                )
-            with gr.Column():
-                # Model configuration
-                model_name = gr.Textbox(
-                    label="Model name",
-                    value="distilbert-base-uncased"
-                )
-                batch_size = gr.Number(
-                    label="Batch size",
-                    value=8,
-                    precision=0,
-                    step=1
-                )
-                epochs = gr.Number(
-                    label="Epochs",
-                    value=3,
-                    precision=0,
-                    step=1
-                )
-        # Process button and output
-        with gr.Row():
-            process_btn = gr.Button("Process and Train")
-            download_output = gr.File(label="Download Combined Dataset")
-        # Event handlers
         process_btn.click(
-            fn=create_interface,
-            inputs=[
-                urls_input,
-                file_input,
-                text_input,
-                model_name,
-                batch_size,
-                epochs
-            ],
-            outputs=download_output
         )
-        # Preview processed URLs
-        with gr.Row():
-            preview_btn = gr.Button("Preview Processed URLs")
-            preview_output = gr.JSON(label="Processed Items")
-        preview_btn.click(
-            fn=process_input,
-            inputs=[urls_input],
-            outputs=[preview_output]
-        )
     return interface
-# Launch the interface
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.launch()

 import json
 import os
 import torch
+import string
 import requests
 from bs4 import BeautifulSoup
 import tempfile
 from tqdm import tqdm
 import logging
 import gradio as gr
+from typing import List, Dict, Union, Optional
+from urllib.parse import urlparse
+import concurrent.futures
+import validators
+from pathlib import Path
+import re
+# Setup logging with more detailed configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('app.log')
+    ]
+)
 logger = logging.getLogger(__name__)
+class URLProcessor:
+    """Class to handle URL processing with advanced features"""
+    def __init__(self, timeout: int = 10, max_retries: int = 3, concurrent_requests: int = 5):
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.concurrent_requests = concurrent_requests
+        self.session = requests.Session()
+        # Add common headers to mimic browser behavior
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        })
+    def validate_url(self, url: str) -> bool:
+        """Validate URL format and accessibility"""
         try:
+            result = urlparse(url)
+            return all([result.scheme, result.netloc]) and validators.url(url)
+        except Exception as e:
+            logger.warning(f"Invalid URL format: {url} - {str(e)}")
+            return False
+    def fetch_content(self, url: str) -> Optional[str]:
+        """Fetch content from URL with retry mechanism"""
+        for attempt in range(self.max_retries):
+            try:
+                response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+                return response.text
+            except requests.RequestException as e:
+                logger.error(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
+                if attempt == self.max_retries - 1:
+                    return None
+            time.sleep(1)  # Delay between retries
+    def process_urls(self, urls: List[str]) -> List[Dict]:
+        """Process multiple URLs concurrently"""
+        valid_urls = [url for url in urls if self.validate_url(url)]
+        if not valid_urls:
+            logger.warning("No valid URLs to process")
+            return []
+        results = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrent_requests) as executor:
+            future_to_url = {executor.submit(self.fetch_content, url): url for url in valid_urls}
+            for future in concurrent.futures.as_completed(future_to_url):
+                url = future_to_url[future]
+                try:
+                    html = future.result()
+                    if html:
+                        text = extract_text(html)
+                        if text:
+                            results.append({
+                                "source": "url",
+                                "url": url,
+                                "content": text,
+                                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                            })
+                        else:
+                            logger.warning(f"No text content extracted from {url}")
+                except Exception as e:
+                    logger.error(f"Error processing {url}: {str(e)}")
+        return results
+def extract_text(html: str) -> str:
+    """Enhanced text extraction with better cleaning"""
     if not html:
         return ""
     soup = BeautifulSoup(html, 'html.parser')
+    # Remove unwanted elements
+    for element in soup(['script', 'style', 'header', 'footer', 'nav']):
+        element.decompose()
+    # Extract text with better formatting
+    text = soup.get_text(separator=' ')
+    # Clean up the text
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    text = ' '.join(chunk for chunk in chunks if chunk)
+    # Remove excessive whitespace
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+class FileProcessor:
+    """Class to handle file processing"""
+    def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
+        self.max_file_size = max_file_size
+        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+    def is_text_file(self, filepath: str) -> bool:
+        """Check if file is a text file"""
+        try:
+            mime_type, _ = mimetypes.guess_type(filepath)
+            return mime_type and mime_type.startswith('text/')
+        except Exception:
+            return False
+    def process_file(self, file) -> List[Dict]:
+        """Process uploaded file with enhanced error handling"""
+        if not file:
+            return []
+        dataset = []
+        try:
+            file_size = os.path.getsize(file.name)
+            if file_size > self.max_file_size:
+                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                return []
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if zipfile.is_zipfile(file.name):
+                    dataset.extend(self._process_zip_file(file.name, temp_dir))
+                else:
+                    dataset.extend(self._process_single_file(file))
+        except Exception as e:
+            logger.error(f"Error processing file: {str(e)}")
+            return []
+        return dataset
+    def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+        """Process ZIP file contents"""
+        results = []
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(temp_dir)
+            for root, _, files in os.walk(temp_dir):
+                for filename in files:
+                    filepath = os.path.join(root, filename)
+                    if self.is_text_file(filepath):
+                        try:
+                            with open(filepath, 'r', errors='ignore') as f:
+                                content = f.read()
+                            if content.strip():
+                                results.append({
+                                    "source": "file",
+                                    "filename": filename,
+                                    "content": content,
+                                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                                })
+                        except Exception as e:
+                            logger.error(f"Error reading file {filename}: {str(e)}")
+        return results
+    def _process_single_file(self, file) -> List[Dict]:
+        """Process single file"""
+        results = []
+        try:
+            content = file.read().decode('utf-8', errors='ignore')
+            if content.strip():
+                results.append({
+                    "source": "file",
+                    "filename": os.path.basename(file.name),
+                    "content": content,
+                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                })
+        except Exception as e:
+            logger.error(f"Error processing single file: {str(e)}")
+        return results
 def preprocess_bulk_text(text: str) -> str:
+    """Enhanced text preprocessing"""
+    if not text:
+        return ""
+    # Normalize line endings
     text = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Define separators
+    separators = ['\n', ' / ', '/', ';', ' - ', '|', '  ']
     # Replace separators with commas if not already comma-separated
     if ',' not in text:
         for separator in separators:
             text = text.replace(separator, ',')
+        # Handle domain endings
         domain_pattern = r'(\.[a-z]{2,})\s+'
         text = re.sub(domain_pattern, r'\1,', text)
+        # Clean up multiple commas and whitespace
         text = re.sub(r',+', ',', text)
         text = text.strip(',' + string.whitespace)
         text = re.sub(r'\s*,\s*', ', ', text)
     return text
+def create_interface():
+    """Create enhanced Gradio interface"""
+    # Custom CSS for better styling
+    custom_css = """
+    .container { max-width: 1200px; margin: auto; padding: 20px; }
+    .output-panel { margin-top: 20px; }
+    .warning { color: #856404; background-color: #fff3cd; padding: 10px; border-radius: 4px; }
+    .error { color: #721c24; background-color: #f8d7da; padding: 10px; border-radius: 4px; }
+    """
+    with gr.Blocks(css=custom_css) as interface:
+        gr.Markdown("# Advanced URL and Text Processing Tool")
+        with gr.Tab("URL Input"):
+            url_input = gr.Textbox(
+                label="Enter URLs (comma-separated or one per line)",
+                placeholder="https://example1.com, https://example2.com",
+                lines=5
             )
+        with gr.Tab("File Input"):
+            file_input = gr.File(
+                label="Upload text file or ZIP archive",
+                file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+            )
+        with gr.Tab("Text Input"):
+            text_input = gr.Textbox(
+                label="Enter text directly",
+                placeholder="Enter your text here...",
+                lines=5
+            )
+        # Process button with loading state
+        process_btn = gr.Button("Process Input", variant="primary")
+        # Output components
         with gr.Row():
+            output_file = gr.File(label="Processed Dataset")
+            output_text = gr.Textbox(
+                label="Processing Results",
+                lines=3,
+                interactive=False
+            )
+        def process_all_inputs(urls, file, text):
+            """Process all input types with progress tracking"""
+            try:
+                dataset = []
+                # Process URLs
+                if urls:
+                    url_processor = URLProcessor()
+                    url_list = [u.strip() for u in urls.split(',') if u.strip()]
+                    dataset.extend(url_processor.process_urls(url_list))
+                # Process files
+                if file:
+                    file_processor = FileProcessor()
+                    dataset.extend(file_processor.process_file(file))
+                # Process text input
+                if text:
+                    processed_text = preprocess_bulk_text(text)
+                    if processed_text:
+                        dataset.append({
+                            "source": "input",
+                            "content": processed_text,
+                            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                        })
+                if not dataset:
+                    return [None, "No valid data to process. Please check your inputs."]
+                # Save results
+                output_file = 'processed_dataset.json'
+                with open(output_file, 'w', encoding='utf-8') as f:
+                    json.dump(dataset, f, indent=2, ensure_ascii=False)
+                # Generate summary
+                summary = f"""
+                Processing completed successfully!
+                - URLs processed: {sum(1 for d in dataset if d['source'] == 'url')}
+                - Files processed: {sum(1 for d in dataset if d['source'] == 'file')}
+                - Text inputs processed: {sum(1 for d in dataset if d['source'] == 'input')}
+                """
+                return [output_file, summary]
+            except Exception as e:
+                error_msg = f"Error during processing: {str(e)}"
+                logger.error(error_msg)
+                return [None, error_msg]
+        # Connect the interface
         process_btn.click(
+            fn=process_all_inputs,
+            inputs=[url_input, file_input, text_input],
+            outputs=[output_file, output_text]
         )
+        # Add comprehensive instructions
+        gr.Markdown("""
+        ## Instructions
+        1. **URL Input**:
+           - Enter URLs separated by commas or new lines
+           - URLs must start with http:// or https://
+           - Invalid URLs will be skipped
+        2. **File Input**:
+           - Upload text files or ZIP archives
+           - Supported formats: .txt, .zip, .md, .csv, .json, .xml
+           - Maximum file size: 10MB
+        3. **Text Input**:
+           - Directly enter or paste text
+           - Text will be automatically formatted
+        4. Click 'Process Input' to generate the dataset
+        The tool will combine all valid inputs into a single JSON dataset file.
+        """)
     return interface
 if __name__ == "__main__":
+    # Initialize mimetypes
+    mimetypes.init()
+    # Create and launch the interface
+    interface = create_interface()
+    interface.launch(
+        share=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+        debug=True
+    )