Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Dec 5, 2024

Commit

10bd6bb

verified ·

1 Parent(s): c04c74a

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -73

app.py CHANGED Viewed

@@ -14,8 +14,6 @@ import mimetypes
 from tqdm import tqdm
 import logging
 import gradio as gr
-import requests
-from bs4 import BeautifulSoup
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -23,15 +21,6 @@ logger = logging.getLogger(__name__)
 # --- URL and File Processing Functions ---
 def fetch_content(url, retries=3):
-    """Fetch content from a given URL with retries on failure.
-    Args:
-        url (str): The URL to fetch content from.
-        retries (int): Number of retries in case of failure.
-    Returns:
-        str: The HTML content of the page, or None if an error occurred.
-    """
     for attempt in range(retries):
         try:
             response = requests.get(url, timeout=10)
@@ -44,65 +33,46 @@ def fetch_content(url, retries=3):
                 return None
 def extract_text(html):
-    """Extract text from HTML content, removing scripts and styles.
-    Args:
-        html (str): The HTML content to extract text from.
-    Returns:
-        str: The extracted text, or an empty string if the input is invalid.
-    """
     if not html:
         logger.warning("Empty HTML content provided for extraction.")
         return ""
     soup = BeautifulSoup(html, 'html.parser')
-    # Remove script and style elements
     for script in soup(["script", "style"]):
         script.decompose()
-    # Get text and clean it up
     text = soup.get_text()
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-    # Join non-empty chunks
     extracted_text = '\n'.join(chunk for chunk in chunks if chunk)
     logger.info("Text extraction completed.")
     return extracted_text
 def process_urls(urls):
-    """Process a list of URLs and return their extracted text."""
     dataset = []
     for url in tqdm(urls, desc="Fetching URLs"):
         if not url.startswith("http://") and not url.startswith("https://"):
             logger.warning(f"Invalid URL format: {url}")
-            continue  # Skip invalid URLs
         html = fetch_content(url)
         if html:
             text = extract_text(html)
-            if text:  # Check if text was extracted
-                dataset.append({
-                    "source": "url",
-                    "url": url,
-                    "content": text
-                })
             else:
                 logger.warning(f"No text extracted from {url}")
         else:
             logger.error(f"Failed to fetch content from {url}")
-        time.sleep(1)  # Be polite to the server
     return dataset
 def process_file(file):
-    """Process uploaded files (including zip files) and extract text."""
     dataset = []
     with tempfile.TemporaryDirectory() as temp_dir:
         if zipfile.is_zipfile(file.name):
             with zipfile.ZipFile(file.name, 'r') as zip_ref:
                 zip_ref.extractall(temp_dir)
-            # Process each extracted file
             for root, _, files in os.walk(temp_dir):
                 for filename in files:
                     filepath = os.path.join(root, filename)
@@ -110,62 +80,42 @@ def process_file(file):
                     if mime_type and mime_type.startswith('text'):
                         with open(filepath, 'r', errors='ignore') as f:
                             content = f.read()
-                        if content.strip():  # Check if content is not empty
-                            dataset.append({
-                                "source": "file",
-                                "filename": filename,
-                                "content": content
-                            })
                         else:
                             logger.warning(f"File {filename} is empty.")
                     else:
                         logger.warning(f"File {filename} is not a text file.")
-                        dataset.append({
-                            "source": "file",
-                            "filename": filename,
-                            "content": "Binary file - content not extracted"
-                        })
         else:
             mime_type, _ = mimetypes.guess_type(file.name)
             if mime_type and mime_type.startswith('text'):
                 content = file.read().decode('utf-8', errors='ignore')
-                if content.strip():  # Check if content is not empty
-                    dataset.append({
-                        "source": "file",
-                        "filename": os.path.basename(file.name),
-                        "content": content
-                    })
                 else:
                     logger.warning(f"Uploaded file {file.name} is empty.")
             else:
                 logger.warning(f"Uploaded file {file.name} is not a text file.")
-                dataset.append({
-                    "source": "file",
-                    "filename": os.path.basename(file.name),
-                    "content": "Binary file - content not extracted"
-                })
     return dataset
 def create_dataset(urls, file, text_input):
-    """Create a combined dataset from URLs, uploaded files, and text input."""
     dataset = []
     if urls:
         dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
     if file:
         dataset.extend(process_file(file))
     if text_input:
-        dataset.extend(process_text(text_input))
-    # Log the contents of the dataset
     logger.info(f"Dataset created with {len(dataset)} entries.")
-    for entry in dataset:
-        logger.debug(f"Entry: {entry}")
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
     return output_file
 # --- Model Training and Evaluation Functions ---
 class CustomDataset(torch.utils.data.Dataset):
     def __init__(self, data, tokenizer, max_length=512):
@@ -178,7 +128,7 @@ class CustomDataset(torch.utils.data.Dataset):
     def __getitem__(self, idx):
         try:
-            text = self.data[idx]['content ']
             label = self.data[idx].get('label', 0)
             encoding = self.tokenizer.encode_plus(
@@ -208,7 +158,6 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
         dataset = CustomDataset(data, tokenizer, max_length=max_length)
-        # Check if dataset is empty
         if len(dataset) == 0:
             logger.error("The dataset is empty. Please check the input data.")
             return None, None
@@ -222,8 +171,8 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
             num_train_epochs=epochs,
             per_device_train_batch_size=batch_size,
             per_device_eval_batch_size=batch_size,
-            eval_strategy='epoch',  # Set evaluation strategy
-            save_strategy='epoch',  # Ensure save strategy matches evaluation strategy
             learning_rate=learning_rate,
             save_steps=500,
             load_best_model_at_end=True,
@@ -301,21 +250,16 @@ def deploy_model(model, tokenizer):
 # Gradio Interface
 def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
     try:
-        # Create the dataset from the provided inputs
         dataset_file = create_dataset(urls, file, text_input)
-        # Load the dataset
         with open(dataset_file, 'r') as f:
             dataset = json.load(f)
-        # Check if the dataset is empty
         if not dataset:
             return "Error: The dataset is empty. Please check your inputs."
-        # Train the model
         model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
-        # Deploy the model
         deploy_model(model, tokenizer)
         return dataset_file
@@ -338,7 +282,7 @@ iface = gr.Interface(
     outputs=gr.File(label="Download Combined Dataset"),
     title="Dataset Creation and Model Training",
     description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
-    theme="default",  # You can change the theme if desired
 )
 # Launch the interface

 from tqdm import tqdm
 import logging
 import gradio as gr
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # --- URL and File Processing Functions ---
 def fetch_content(url, retries=3):
     for attempt in range(retries):
         try:
             response = requests.get(url, timeout=10)
                 return None
 def extract_text(html):
     """Return the visible text of an HTML document, one fragment per line.

     ``<script>`` and ``<style>`` elements are removed before the text is
     read. Each line of the remaining text is stripped and split on runs of
     two spaces; every non-empty piece becomes its own output line.

     Args:
         html: The HTML markup to process.

     Returns:
         The newline-joined text fragments, or an empty string when ``html``
         is falsy (a warning is logged in that case).
     """
     if not html:
         logger.warning("Empty HTML content provided for extraction.")
         return ""
     soup = BeautifulSoup(html, 'html.parser')
     # Drop elements whose contents are code or styling, not readable text.
     for tag in soup(["script", "style"]):
         tag.decompose()
     fragments = []
     for raw_line in soup.get_text().splitlines():
         # A double space is treated as a separator between phrases.
         for piece in raw_line.strip().split("  "):
             piece = piece.strip()
             if piece:
                 fragments.append(piece)
     extracted_text = '\n'.join(fragments)
     logger.info("Text extraction completed.")
     return extracted_text
 def process_urls(urls):
     dataset = []
     for url in tqdm(urls, desc="Fetching URLs"):
         if not url.startswith("http://") and not url.startswith("https://"):
             logger.warning(f"Invalid URL format: {url}")
+            continue
         html = fetch_content(url)
         if html:
             text = extract_text(html)
+            if text:
+                dataset.append({"source": "url", "url": url, "content": text})
             else:
                 logger.warning(f"No text extracted from {url}")
         else:
             logger.error(f"Failed to fetch content from {url}")
+        time.sleep(1)
     return dataset
 def process_file(file):
     dataset = []
     with tempfile.TemporaryDirectory() as temp_dir:
         if zipfile.is_zipfile(file.name):
             with zipfile.ZipFile(file.name, 'r') as zip_ref:
                 zip_ref.extractall(temp_dir)
             for root, _, files in os.walk(temp_dir):
                 for filename in files:
                     filepath = os.path.join(root, filename)
                     if mime_type and mime_type.startswith('text'):
                         with open(filepath, 'r', errors='ignore') as f:
                             content = f.read()
+                        if content.strip():
+                            dataset.append({"source": "file", "filename": filename, "content": content})
                         else:
                             logger.warning(f"File {filename} is empty.")
                     else:
                         logger.warning(f"File {filename} is not a text file.")
+                        dataset.append({"source": "file", "filename": filename, "content": "Binary file - content not extracted"})
         else:
             mime_type, _ = mimetypes.guess_type(file.name)
             if mime_type and mime_type.startswith('text'):
                 content = file.read().decode('utf-8', errors='ignore')
+                if content.strip():
+                    dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": content})
                 else:
                     logger.warning(f"Uploaded file {file.name} is empty.")
             else:
                 logger.warning(f"Uploaded file {file.name} is not a text file.")
+                dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": "Binary file - content not extracted"})
     return dataset
 def create_dataset(urls, file, text_input):
     dataset = []
     if urls:
         dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
     if file:
         dataset.extend(process_file(file))
     if text_input:
+        dataset.append({"source": "input", "content": text_input})
     logger.info(f"Dataset created with {len(dataset)} entries.")
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
     return output_file
 # --- Model Training and Evaluation Functions ---
 class CustomDataset(torch.utils.data.Dataset):
     def __init__(self, data, tokenizer, max_length=512):
     def __getitem__(self, idx):
         try:
+            text = self.data[idx]['content']  # Fixed the key to 'content'
             label = self.data[idx].get('label', 0)
             encoding = self.tokenizer.encode_plus(
         dataset = CustomDataset(data, tokenizer, max_length=max_length)
         if len(dataset) == 0:
             logger.error("The dataset is empty. Please check the input data.")
             return None, None
             num_train_epochs=epochs,
             per_device_train_batch_size=batch_size,
             per_device_eval_batch_size=batch_size,
+            eval_strategy='epoch',
+            save_strategy='epoch',
             learning_rate=learning_rate,
             save_steps=500,
             load_best_model_at_end=True,
 # Gradio Interface
 def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
     try:
         dataset_file = create_dataset(urls, file, text_input)
         with open(dataset_file, 'r') as f:
             dataset = json.load(f)
         if not dataset:
             return "Error: The dataset is empty. Please check your inputs."
         model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
         deploy_model(model, tokenizer)
         return dataset_file
     outputs=gr.File(label="Download Combined Dataset"),
     title="Dataset Creation and Model Training",
     description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
+    theme="default",
 )
 # Launch the interface