Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Nov 23, 2024

Commit

def429d

verified ·

1 Parent(s): efc2a65

Update app.py

Browse files

Files changed (1) hide show

app.py +164 -15

app.py CHANGED Viewed

@@ -1,14 +1,25 @@
-import gradio as gr
-import requests
-from bs4 import BeautifulSoup
 import json
-import time
-import zipfile
 import os
 import tempfile
 import mimetypes
 from tqdm import tqdm
 def fetch_content(url):
     """Fetch content from a given URL."""
     try:
@@ -16,7 +27,7 @@ def fetch_content(url):
         response.raise_for_status()
         return response.text
     except requests.RequestException as e:
-        print(f"Error fetching {url}: {e}")
         return None
 def extract_text(html):
@@ -65,7 +76,6 @@ def process_file(file):
                             "content": content
                         })
                     else:
-                        # For non-text files, just store the filename
                         dataset.append({
                             "source": "file",
                             "filename": filename,
@@ -81,7 +91,6 @@ def process_file(file):
                     "content": content
                 })
             else:
-                # For non-text files, just store the filename
                 dataset.append({
                     "source": "file",
                     "filename": os.path.basename(file.name),
@@ -106,28 +115,168 @@ def create_dataset(urls, file, text_input):
     if text_input:
         dataset.extend(process_text(text_input))
-    # Save the dataset as JSON
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
     return output_file
 # Gradio Interface
-def gradio_interface(urls, file, text_input):
-    return create_dataset(urls, file, text_input)
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
         gr.Textbox(lines=5, label="Enter comma-separated URLs"),
         gr.File(label="Upload file (including zip files)"),
-        gr.Textbox(lines=10, label="Enter or paste large text")
     ],
     outputs=gr.File(label="Download Combined Dataset"),
-    title="URL, File, and Text to Dataset Converter",
-    description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
 )
 # Launch the interface
-iface.launch()

 import json
 import os
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from sklearn.metrics import accuracy_score
+from torch.utils.data import DataLoader
+from transformers import Trainer, TrainingArguments
+import time
+import requests
+from bs4 import BeautifulSoup
 import tempfile
+import zipfile
 import mimetypes
 from tqdm import tqdm
+import logging
+import gradio as gr
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# --- URL and File Processing Functions ---
 def fetch_content(url):
     """Fetch content from a given URL."""
     try:
         response.raise_for_status()
         return response.text
     except requests.RequestException as e:
+        logger.error(f"Error fetching {url}: {e}")
         return None
 def extract_text(html):
                             "content": content
                         })
                     else:
                         dataset.append({
                             "source": "file",
                             "filename": filename,
                     "content": content
                 })
             else:
                 dataset.append({
                     "source": "file",
                     "filename": os.path.basename(file.name),
     if text_input:
         dataset.extend(process_text(text_input))
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
     return output_file
+# --- Model Training and Evaluation Functions ---
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes text records for sequence classification.

    Every record is read as a mapping with a ``'content'`` entry (the text to
    tokenize) and an optional ``'label'`` entry, which defaults to ``0``.
    """

    def __init__(self, data, tokenizer, max_length=512):
        """Store the records and the tokenization settings.

        Args:
            data: Indexable, sized collection of record mappings.
            tokenizer: Object exposing ``encode_plus`` (presumably a Hugging
                Face tokenizer; confirm against the caller).
            max_length: Length every example is padded or truncated to.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its model inputs.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer output with its leading batch dimension removed) and
            ``'labels'`` (a ``torch.long`` scalar tensor).

        Raises:
            Exception: Any error raised while reading or tokenizing the
                record is logged and re-raised unchanged.
        """
        try:
            record = self.data[idx]
            encoding = self.tokenizer.encode_plus(
                record['content'],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            return {
                # squeeze(0) drops only the batch dimension added by
                # return_tensors='pt'. A bare squeeze() would also collapse
                # the sequence dimension whenever max_length == 1.
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': torch.tensor(record.get('label', 0), dtype=torch.long),
            }
        except Exception as e:
            logger.error(f"Error in processing item {idx}: {e}")
            raise
def _select_trainable_examples(data):
    """Return the records of ``data`` that carry non-empty string ``'content'``."""
    return [
        item for item in (data or [])
        if isinstance(item, dict)
        and isinstance(item.get('content'), str)
        and item['content'].strip()
    ]


def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_length=512):
    """Fine-tune a sequence-classification model on ``data`` and save it.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        data: Iterable of record dicts; only records with non-empty string
            ``'content'`` are used (``'label'`` is optional).
        batch_size: Per-device batch size for training and evaluation;
            coerced to ``int``.
        epochs: Number of training epochs.
        learning_rate: Optimizer learning rate.
        max_length: Token length every example is padded/truncated to.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        ValueError: If fewer than two usable examples remain or the batch
            size is not positive.
        Exception: Any error raised while loading, training, evaluating or
            saving is logged and re-raised.

    Side effects:
        Writes checkpoints under ``./results`` and the final model to
        ``./model``.
    """
    # Records without text would raise KeyError in CustomDataset, which
    # indexes item['content'], so drop them before training starts.
    examples = _select_trainable_examples(data)
    if len(examples) < 2:
        # The 80/20 split needs at least one example on each side.
        raise ValueError(
            f"Need at least 2 text examples to train, got {len(examples)}."
        )
    batch_size = int(batch_size)  # UI number fields may deliver floats
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        dataset = CustomDataset(examples, tokenizer, max_length=max_length)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        # Seed the split explicitly: the Trainer seed below is only applied
        # after the split has already been drawn.
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset,
            [train_size, val_size],
            generator=torch.Generator().manual_seed(42),
        )

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            evaluation_strategy='epoch',
            # load_best_model_at_end requires the save strategy to match the
            # evaluation strategy; step-based saving is rejected at
            # construction time.
            save_strategy='epoch',
            learning_rate=learning_rate,
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            greater_is_better=True,
            save_total_limit=2,
            seed=42,
            dataloader_num_workers=4,
            fp16=torch.cuda.is_available(),
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=lambda pred: {
                'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))
            },
        )

        logger.info("Starting model training...")
        start_time = time.time()
        trainer.train()
        end_time = time.time()
        logger.info(f'Training time: {end_time - start_time:.2f} seconds')

        logger.info("Evaluating model...")
        eval_result = trainer.evaluate()
        logger.info(f'Evaluation result: {eval_result}')

        trainer.save_model('./model')
        return model, tokenizer
    except Exception as e:
        logger.error(f"Error during training: {e}")
        raise
def deploy_model(model, tokenizer):
    """Save the model and tokenizer and write a standalone inference script.

    The model and tokenizer are saved to ``./model`` via ``save_pretrained``,
    and ``./deployment.py`` is written with a ``predict(text)`` helper that
    reloads them from that directory.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.

    Raises:
        Exception: Any error raised while saving or writing the script is
            logged and re-raised.
    """
    import textwrap

    try:
        model.save_pretrained('./model')
        tokenizer.save_pretrained('./model')

        # dedent() strips the indentation this literal inherits from the
        # function body; writing it indented would make the generated file
        # fail with IndentationError on its first line.
        deployment_script = textwrap.dedent('''\
            import sys

            import torch
            from transformers import AutoModelForSequenceClassification, AutoTokenizer

            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            model = AutoModelForSequenceClassification.from_pretrained('./model')
            model.to(device)
            model.eval()
            tokenizer = AutoTokenizer.from_pretrained('./model')


            def predict(text):
                encoding = tokenizer.encode_plus(
                    text,
                    max_length=512,
                    padding='max_length',
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors='pt',
                )
                input_ids = encoding['input_ids'].to(device)
                attention_mask = encoding['attention_mask'].to(device)
                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=attention_mask)
                return int(torch.argmax(outputs.logits, dim=1).item())


            if __name__ == '__main__':
                print(predict(' '.join(sys.argv[1:])))
            ''')

        with open('./deployment.py', 'w') as f:
            f.write(deployment_script)
        logger.info('Model deployed successfully. To use the model, run: python deployment.py')
    except Exception as e:
        logger.error(f"Error deploying model: {e}")
        raise
 # Gradio Interface
def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
    """Build the dataset, train a model on it, deploy it, and return the dataset path.

    Args:
        urls: URL text forwarded to ``create_dataset``.
        file: Uploaded file forwarded to ``create_dataset``.
        text_input: Pasted text forwarded to ``create_dataset``.
        model_name: Model identifier forwarded to ``train_model``.
        batch_size: Training batch size; must be a positive number.
        epochs: Number of training epochs; must be a positive number.

    Returns:
        Path of the JSON dataset file produced by ``create_dataset``.

    Raises:
        ValueError: If ``batch_size`` or ``epochs`` is missing or not positive.
    """
    # Numeric UI fields can deliver floats (or None when cleared), while the
    # training code needs whole numbers. Validate before doing any
    # expensive work.
    if batch_size is None or epochs is None:
        raise ValueError("Batch size and epochs are required.")
    batch_size = int(batch_size)
    epochs = int(epochs)
    if batch_size < 1 or epochs < 1:
        raise ValueError("Batch size and epochs must be positive whole numbers.")

    dataset_file = create_dataset(urls, file, text_input)
    with open(dataset_file, 'r') as f:
        dataset = json.load(f)
    model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
    deploy_model(model, tokenizer)
    return dataset_file
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
         gr.Textbox(lines=5, label="Enter comma-separated URLs"),
         gr.File(label="Upload file (including zip files)"),
+        gr.Textbox(lines=10, label="Enter or paste large text"),
+        gr.Textbox(label="Model name", value="distilbert-base-uncased"),
+        gr.Number(label="Batch size", value=8),
+        gr.Number(label="Epochs", value=3),
     ],
     outputs=gr.File(label="Download Combined Dataset"),
+    title="Dataset Creation and Model Training",
+    description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
 )
 # Launch the interface
+if __name__ == "__main__":
+    iface.launch()