Spaces:

acecalisto3
/

urld

Running

App Files Community

acecalisto3 committed on Dec 3, 2024

Commit

ecc3973

verified ·

1 Parent(s): 98cf6a3

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -49

app.py CHANGED Viewed

@@ -14,44 +14,84 @@ import mimetypes
 from tqdm import tqdm
 import logging
 import gradio as gr
 # Setup logging
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # --- URL and File Processing Functions ---
-def fetch_content(url):
-    """Fetch content from a given URL."""
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        logger.error(f"Error fetching {url}: {e}")
-        return None
 def extract_text(html):
-    """Extract text from HTML content."""
     soup = BeautifulSoup(html, 'html.parser')
     for script in soup(["script", "style"]):
         script.decompose()
     text = soup.get_text()
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    return '\n'.join(chunk for chunk in chunks if chunk)
 def process_urls(urls):
     """Process a list of URLs and return their extracted text."""
     dataset = []
     for url in tqdm(urls, desc="Fetching URLs"):
         html = fetch_content(url)
         if html:
             text = extract_text(html)
-            dataset.append({
-                "source": "url",
-                "url": url,
-                "content": text
-            })
         time.sleep(1)  # Be polite to the server
     return dataset
@@ -70,12 +110,16 @@ def process_file(file):
                     if mime_type and mime_type.startswith('text'):
                         with open(filepath, 'r', errors='ignore') as f:
                             content = f.read()
-                        dataset.append({
-                            "source": "file",
-                            "filename": filename,
-                            "content": content
-                        })
                     else:
                         dataset.append({
                             "source": "file",
                             "filename": filename,
@@ -85,12 +129,16 @@ def process_file(file):
             mime_type, _ = mimetypes.guess_type(file.name)
             if mime_type and mime_type.startswith('text'):
                 content = file.read().decode('utf-8', errors='ignore')
-                dataset.append({
-                    "source": "file",
-                    "filename": os.path.basename(file.name),
-                    "content": content
-                })
             else:
                 dataset.append({
                     "source": "file",
                     "filename": os.path.basename(file.name),
@@ -98,13 +146,6 @@ def process_file(file):
                 })
     return dataset
-def process_text(text):
-    """Process raw text input."""
-    return [{
-        "source": "text_input",
-        "content": text
-    }]
 def create_dataset(urls, file, text_input):
     """Create a combined dataset from URLs, uploaded files, and text input."""
     dataset = []
@@ -115,12 +156,16 @@ def create_dataset(urls, file, text_input):
     if text_input:
         dataset.extend(process_text(text_input))
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
     return output_file
 # --- Model Training and Evaluation Functions ---
 class CustomDataset(torch.utils.data.Dataset):
     def __init__(self, data, tokenizer, max_length=512):
@@ -162,6 +207,12 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
         model.to(device)
         dataset = CustomDataset(data, tokenizer, max_length=max_length)
         train_size = int(0.8 * len(dataset))
         val_size = len(dataset) - train_size
         train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
@@ -171,7 +222,8 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
             num_train_epochs=epochs,
             per_device_train_batch_size=batch_size,
             per_device_eval_batch_size=batch_size,
-            evaluation_strategy='epoch',
             learning_rate=learning_rate,
             save_steps=500,
             load_best_model_at_end=True,
@@ -248,30 +300,45 @@ def deploy_model(model, tokenizer):
 # Gradio Interface
 def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
-    dataset_file = create_dataset(urls, file, text_input)
-    with open(dataset_file, 'r') as f:
-        dataset = json.load(f)
-    model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
-    deploy_model(model, tokenizer)
-    return dataset_file
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.Textbox(lines=5, label="Enter comma-separated URLs"),
         gr.File(label="Upload file (including zip files)", type="filepath"),
-        gr.Textbox(lines=10, label="Enter or paste large text"),
         gr.Textbox(label="Model name", value="distilbert-base-uncased"),
-        gr.Number(label="Batch size", value=8),
-        gr.Number(label="Epochs", value=3),
     ],
- outputs=gr.File(label="Download Combined Dataset"),
     title="Dataset Creation and Model Training",
     description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
 )
 # Launch the interface

 from tqdm import tqdm
 import logging
 import gradio as gr
+import requests
+from bs4 import BeautifulSoup
 # Setup logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 # --- URL and File Processing Functions ---
+def fetch_content(url, retries=3):
+    """Fetch content from a given URL with retries on failure.
+    Args:
+        url (str): The URL to fetch content from.
+        retries (int): Number of retries in case of failure.
+    Returns:
+        str: The HTML content of the page, or None if an error occurred.
+    """
+    for attempt in range(retries):
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+            logger.info(f"Successfully fetched content from {url}")
+            return response.text
+        except requests.RequestException as e:
+            logger.error(f"Error fetching {url} (attempt {attempt + 1}/{retries}): {e}")
+            if attempt == retries - 1:
+                return None
 def extract_text(html):
+    """Extract text from HTML content, removing scripts and styles.
+    Args:
+        html (str): The HTML content to extract text from.
+    Returns:
+        str: The extracted text, or an empty string if the input is invalid.
+    """
+    if not html:
+        logger.warning("Empty HTML content provided for extraction.")
+        return ""
     soup = BeautifulSoup(html, 'html.parser')
+    # Remove script and style elements
     for script in soup(["script", "style"]):
         script.decompose()
+    # Get text and clean it up
     text = soup.get_text()
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    # Join non-empty chunks
+    extracted_text = '\n'.join(chunk for chunk in chunks if chunk)
+    logger.info("Text extraction completed.")
+    return extracted_text
 def process_urls(urls):
     """Process a list of URLs and return their extracted text."""
     dataset = []
     for url in tqdm(urls, desc="Fetching URLs"):
+        if not url.startswith("http://") and not url.startswith("https://"):
+            logger.warning(f"Invalid URL format: {url}")
+            continue  # Skip invalid URLs
         html = fetch_content(url)
         if html:
             text = extract_text(html)
+            if text:  # Check if text was extracted
+                dataset.append({
+                    "source": "url",
+                    "url": url,
+                    "content": text
+                })
+            else:
+                logger.warning(f"No text extracted from {url}")
+        else:
+            logger.error(f"Failed to fetch content from {url}")
         time.sleep(1)  # Be polite to the server
     return dataset
                     if mime_type and mime_type.startswith('text'):
                         with open(filepath, 'r', errors='ignore') as f:
                             content = f.read()
+                        if content.strip():  # Check if content is not empty
+                            dataset.append({
+                                "source": "file",
+                                "filename": filename,
+                                "content": content
+                            })
+                        else:
+                            logger.warning(f"File {filename} is empty.")
                     else:
+                        logger.warning(f"File {filename} is not a text file.")
                         dataset.append({
                             "source": "file",
                             "filename": filename,
             mime_type, _ = mimetypes.guess_type(file.name)
             if mime_type and mime_type.startswith('text'):
                 content = file.read().decode('utf-8', errors='ignore')
+                if content.strip():  # Check if content is not empty
+                    dataset.append({
+                        "source": "file",
+                        "filename": os.path.basename(file.name),
+                        "content": content
+                    })
+                else:
+                    logger.warning(f"Uploaded file {file.name} is empty.")
             else:
+                logger.warning(f"Uploaded file {file.name} is not a text file.")
                 dataset.append({
                     "source": "file",
                     "filename": os.path.basename(file.name),
                 })
     return dataset
 def create_dataset(urls, file, text_input):
     """Create a combined dataset from URLs, uploaded files, and text input."""
     dataset = []
     if text_input:
         dataset.extend(process_text(text_input))
+    # Log the contents of the dataset
+    logger.info(f"Dataset created with {len(dataset)} entries.")
+    for entry in dataset:
+        logger.debug(f"Entry: {entry}")
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
     return output_file
 # --- Model Training and Evaluation Functions ---
 class CustomDataset(torch.utils.data.Dataset):
     def __init__(self, data, tokenizer, max_length=512):
         model.to(device)
         dataset = CustomDataset(data, tokenizer, max_length=max_length)
+        # Check if dataset is empty
+        if len(dataset) == 0:
+            logger.error("The dataset is empty. Please check the input data.")
+            return None, None
         train_size = int(0.8 * len(dataset))
         val_size = len(dataset) - train_size
         train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
             num_train_epochs=epochs,
             per_device_train_batch_size=batch_size,
             per_device_eval_batch_size=batch_size,
+            evaluation_strategy='epoch',  # Set evaluation strategy
+            save_strategy='epoch',  # Ensure save strategy matches evaluation strategy
             learning_rate=learning_rate,
             save_steps=500,
             load_best_model_at_end=True,
 # Gradio Interface
 def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
+    try:
+        # Create the dataset from the provided inputs
+        dataset_file = create_dataset(urls, file, text_input)
+        # Load the dataset
+        with open(dataset_file, 'r') as f:
+            dataset = json.load(f)
+        # Check if the dataset is empty
+        if not dataset:
+            return "Error: The dataset is empty. Please check your inputs."
+        # Train the model
+        model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
+        # Deploy the model
+        deploy_model(model, tokenizer)
+        return dataset_file
+    except Exception as e:
+        logger.error(f"Error in gradio_interface: {e}")
+        return f"An error occurred: {str(e)}"
+# Gradio Interface Setup
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
+        gr.Textbox(lines=5, label="Enter comma-separated URLs", placeholder="http://example.com, https://example.org"),
         gr.File(label="Upload file (including zip files)", type="filepath"),
+        gr.Textbox(lines=10, label="Enter or paste large text", placeholder="Your text here..."),
         gr.Textbox(label="Model name", value="distilbert-base-uncased"),
+        gr.Number(label="Batch size", value=8, precision=0, step=1),
+        gr.Number(label="Epochs", value=3, precision=0, step=1),
     ],
+    outputs=gr.File(label="Download Combined Dataset"),
     title="Dataset Creation and Model Training",
     description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
+    theme="default",  # You can change the theme if desired
 )
 # Launch the interface