Spaces:

Tastycoole
/

frugal_ai_challenge

Sleeping

App Files Files Community

Tastycoole committed on Jan 19

Commit

b7b2f27

1 Parent(s): c3ed611

test base api

Browse files

Files changed (12) hide show

.gitignore +17 -0
Dockerfile +16 -0
README.md +67 -9
app.py +27 -0
requirements.txt +10 -0
tasks/__init__.py +0 -0
tasks/audio.py +88 -0
tasks/image.py +172 -0
tasks/text.py +92 -0
tasks/utils/__init__.py +0 -0
tasks/utils/emissions.py +28 -0
tasks/utils/evaluation.py +18 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,17 @@

+.ipynb_checkpoints/sandbox-checkpoint.ipynb
+auto_evals/
+venv/
+__pycache__/
+.env
+.ipynb_checkpoints
+*ipynb
+.vscode/
+eval-queue/
+eval-results/
+eval-queue-bk/
+eval-results-bk/
+logs/
+emissions.csv

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.9
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+WORKDIR /app
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,13 +1,71 @@
 ---
-title: Frugal Ai Challenge
-emoji: 🌍
-colorFrom: purple
-colorTo: gray
-sdk: streamlit
-sdk_version: 1.41.1
-app_file: app.py
 pinned: false
-short_description: test API
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Submission Template
+emoji: 🔥
+colorFrom: yellow
+colorTo: green
+sdk: docker
 pinned: false
 ---
+# Random Baseline Model for Climate Disinformation Classification
+## Model Description
+This is a random baseline model for the Frugal AI Challenge 2024, specifically for the text classification task of identifying climate disinformation. The model serves as a performance floor, randomly assigning labels to text inputs without any learning.
+### Intended Use
+- **Primary intended uses**: Baseline comparison for climate disinformation classification models
+- **Primary intended users**: Researchers and developers participating in the Frugal AI Challenge
+- **Out-of-scope use cases**: Not intended for production use or real-world classification tasks
+## Training Data
+The model uses the QuotaClimat/frugalaichallenge-text-train dataset:
+- Size: ~6000 examples
+- Split: 80% train, 20% test
+- 8 categories of climate disinformation claims
+### Labels
+0. No relevant claim detected
+1. Global warming is not happening
+2. Not caused by humans
+3. Not bad or beneficial
+4. Solutions harmful/unnecessary
+5. Science is unreliable
+6. Proponents are biased
+7. Fossil fuels are needed
+## Performance
+### Metrics
+- **Accuracy**: ~12.5% (random chance with 8 classes)
+- **Environmental Impact**:
+  - Emissions tracked in gCO2eq
+  - Energy consumption tracked in Wh
+### Model Architecture
+The model implements a random choice between the 8 possible labels, serving as the simplest possible baseline.
+## Environmental Impact
+Environmental impact is tracked using CodeCarbon, measuring:
+- Carbon emissions during inference
+- Energy consumption during inference
+This tracking helps establish a baseline for the environmental impact of model deployment and inference.
+## Limitations
+- Makes completely random predictions
+- No learning or pattern recognition
+- No consideration of input text
+- Serves only as a baseline reference
+- Not suitable for any real-world applications
+## Ethical Considerations
+- Dataset contains sensitive topics related to climate disinformation
+- Model makes random predictions and should not be used for actual classification
+- Environmental impact is tracked to promote awareness of AI's carbon footprint
+

app.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from fastapi import FastAPI
+from dotenv import load_dotenv
+from tasks import text, image, audio
+# Read settings from a local .env file before anything looks at os.environ
+load_dotenv()
+# The ASGI application object served by uvicorn as "app:app"
+app = FastAPI(
+    description="API for the Frugal AI Challenge evaluation endpoints",
+    title="Frugal AI Challenge API",
+)
+# Mount one router per challenge task (text, then image, then audio)
+app.include_router(text.router)
+app.include_router(image.router)
+app.include_router(audio.router)
+@app.get("/")
+async def root():
+    """Return a welcome message and an index of the task routes.
+
+    The image and audio routers are mounted on the app together with the
+    text router, so their entries no longer claim to be "coming soon".
+    """
+    return {
+        "message": "Welcome to the Frugal AI Challenge API",
+        "endpoints": {
+            "text": "/text - Text classification task",
+            "image": "/image - Image classification task",
+            "audio": "/audio - Audio classification task",
+        },
+    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi>=0.68.0
+uvicorn>=0.15.0
+codecarbon>=2.3.1
+datasets>=2.14.0
+scikit-learn>=1.0.2
+pydantic>=1.10.0
+python-dotenv>=1.0.0
+gradio>=4.0.0
+requests>=2.31.0
+librosa==0.10.2.post1

tasks/__init__.py ADDED Viewed

File without changes

tasks/audio.py ADDED Viewed

	@@ -0,0 +1,88 @@

+from fastapi import APIRouter
+from datetime import datetime
+from datasets import load_dataset
+from sklearn.metrics import accuracy_score
+import random
+import os
+from .utils.evaluation import AudioEvaluationRequest
+from .utils.emissions import tracker, clean_emissions_data, get_space_info
+from dotenv import load_dotenv
+# Load a local .env file so os.getenv("HF_TOKEN") in the route below can see values defined there
+load_dotenv()
+# Router exposing the audio task; app.py mounts it on the main application
+router = APIRouter()
+# Used as the route description and reported as "model_description" in the results
+DESCRIPTION = "Random Baseline"
+# Path of the evaluation endpoint, echoed back as "api_route" in the results
+ROUTE = "/audio"
+@router.post(ROUTE, tags=["Audio Task"],
+             description=DESCRIPTION)
+async def evaluate_audio(request: AudioEvaluationRequest):
+    """
+    Evaluate audio classification for rainforest sound detection.
+    Current Model: Random Baseline
+    - Makes random predictions from the label space (0-1)
+    - Used as a baseline for comparison
+    The request supplies the dataset name, the test split size and the split seed.
+    Returns a dict with the accuracy, the energy/emissions figures of the tracked
+    inference pass, and the dataset configuration that was used.
+    """
+    # Get space info
+    username, space_url = get_space_info()
+    # Reference mapping of class names to label ids; the random baseline does not use it
+    # NOTE(review): the dataset's "label" column is assumed to already hold these ids - confirm
+    LABEL_MAPPING = {
+        "chainsaw": 0,
+        "environment": 1
+    }
+    # Load and prepare the dataset
+    # Because the dataset is gated, we need to use the HF_TOKEN environment variable to authenticate
+    dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
+    # Split dataset
+    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
+    test_dataset = train_test["test"]
+    # Start tracking emissions
+    tracker.start()
+    tracker.start_task("inference")
+    try:
+        #--------------------------------------------------------------------------------------------
+        # YOUR MODEL INFERENCE CODE HERE
+        # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
+        #--------------------------------------------------------------------------------------------
+        # Make random predictions (placeholder for actual model inference)
+        true_labels = test_dataset["label"]
+        predictions = [random.randint(0, 1) for _ in true_labels]
+        #--------------------------------------------------------------------------------------------
+        # YOUR MODEL INFERENCE STOPS HERE
+        #--------------------------------------------------------------------------------------------
+    finally:
+        # Stop tracking emissions even when inference raises, so the shared tracker
+        # is not left with an open "inference" task for the next request
+        emissions_data = tracker.stop_task()
+    # Calculate accuracy
+    accuracy = accuracy_score(true_labels, predictions)
+    # Prepare results dictionary (tracker values are multiplied by 1000 for the Wh / gCO2eq fields)
+    results = {
+        "username": username,
+        "space_url": space_url,
+        "submission_timestamp": datetime.now().isoformat(),
+        "model_description": DESCRIPTION,
+        "accuracy": float(accuracy),
+        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
+        "emissions_gco2eq": emissions_data.emissions * 1000,
+        "emissions_data": clean_emissions_data(emissions_data),
+        "api_route": ROUTE,
+        "dataset_config": {
+            "dataset_name": request.dataset_name,
+            "test_size": request.test_size,
+            "test_seed": request.test_seed
+        }
+    }
+    return results

tasks/image.py ADDED Viewed

	@@ -0,0 +1,172 @@

+from fastapi import APIRouter
+from datetime import datetime
+from datasets import load_dataset
+import numpy as np
+from sklearn.metrics import accuracy_score
+import random
+import os
+from .utils.evaluation import ImageEvaluationRequest
+from .utils.emissions import tracker, clean_emissions_data, get_space_info
+from dotenv import load_dotenv
+# Load a local .env file so os.getenv("HF_TOKEN") in the route below can see values defined there
+load_dotenv()
+# Router exposing the image task; app.py mounts it on the main application
+router = APIRouter()
+# Used as the route description and reported as "model_description" in the results
+DESCRIPTION = "Random Baseline"
+# Path of the evaluation endpoint, echoed back as "api_route" in the results
+ROUTE = "/image"
+def parse_boxes(annotation_string):
+    """Extract the bounding boxes encoded in one whitespace-separated annotation string.
+    The numbers are read in consecutive groups of five (class_id, x_center, y_center,
+    width, height). The class_id is dropped and an incomplete trailing group is ignored.
+    Returns a list of [x_center, y_center, width, height] lists of floats."""
+    numbers = list(map(float, annotation_string.strip().split()))
+    # Stop early enough that every start index still has a full group of five after it
+    group_starts = range(0, len(numbers) - 4, 5)
+    # Slice from start + 1 to leave out the class_id that opens each group
+    return [numbers[start + 1:start + 5] for start in group_starts]
+def compute_iou(box1, box2):
+    """Return the Intersection over Union of two (x_center, y_center, width, height) boxes.
+    Each box is first converted to its (x1, y1, x2, y2) corners. A small constant is added
+    to the union in the denominator so two zero-area boxes do not divide by zero."""
+    def yolo_to_corners(box):
+        # Centre/size representation -> opposite corners
+        cx, cy, w, h = box
+        half_w, half_h = w/2, h/2
+        return np.array([cx - half_w, cy - half_h, cx + half_w, cy + half_h])
+    a_left, a_top, a_right, a_bottom = yolo_to_corners(box1)
+    b_left, b_top, b_right, b_bottom = yolo_to_corners(box2)
+    # Overlap extent along each axis, clamped to zero when the boxes are disjoint
+    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
+    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
+    intersection = overlap_w * overlap_h
+    # Union = both areas minus the part counted twice
+    area_a = (a_right - a_left) * (a_bottom - a_top)
+    area_b = (b_right - b_left) * (b_bottom - b_top)
+    union = area_a + area_b - intersection
+    return intersection / (union + 1e-6)
+def compute_max_iou(true_boxes, pred_box):
+    """Return the best IoU that pred_box reaches against any box in true_boxes.
+    The result is 0 when true_boxes is empty."""
+    # Seeding the candidates with 0 keeps the empty case and the running-max order unchanged
+    candidates = [0, *(compute_iou(true_box, pred_box) for true_box in true_boxes)]
+    return max(candidates)
+@router.post(ROUTE, tags=["Image Task"],
+             description=DESCRIPTION)
+async def evaluate_image(request: ImageEvaluationRequest):
+    """
+    Evaluate image classification and object detection for forest fire smoke.
+    Current Model: Random Baseline
+    - Makes random predictions for both classification and bounding boxes
+    - Used as a baseline for comparison
+    Metrics:
+    - Classification accuracy: Whether an image contains smoke or not
+    - Object Detection accuracy: IoU (Intersection over Union) for smoke bounding boxes
+    The request supplies the dataset name, the test split size and the split seed.
+    Returns a dict with both metrics, the energy/emissions figures of the tracked
+    inference pass, and the dataset configuration that was used.
+    """
+    # Get space info
+    username, space_url = get_space_info()
+    # Load and prepare the dataset
+    dataset = load_dataset(request.dataset_name, token=os.getenv("HF_TOKEN"))
+    # Split dataset
+    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
+    test_dataset = train_test["test"]
+    # Start tracking emissions
+    tracker.start()
+    tracker.start_task("inference")
+    try:
+        #--------------------------------------------------------------------------------------------
+        # YOUR MODEL INFERENCE CODE HERE
+        # Update the code below to replace the random baseline with your model inference
+        #--------------------------------------------------------------------------------------------
+        predictions = []
+        true_labels = []
+        pred_boxes = []
+        true_boxes_list = []  # List of lists, each inner list contains boxes for one image
+        for example in test_dataset:
+            # Parse true annotation (YOLO format: class_id x_center y_center width height)
+            # A missing or None annotation is treated like an empty one (no smoke)
+            annotation = (example.get("annotations") or "").strip()
+            has_smoke = bool(annotation)
+            true_labels.append(int(has_smoke))
+            # Make random classification prediction
+            pred_has_smoke = random.random() > 0.5
+            predictions.append(int(pred_has_smoke))
+            # If there's a true box, parse it and make random box prediction
+            if has_smoke:
+                # Parse all true boxes from the annotation
+                image_true_boxes = parse_boxes(annotation)
+                true_boxes_list.append(image_true_boxes)
+                # For baseline, make one random box prediction per image
+                # In a real model, you might want to predict multiple boxes
+                random_box = [
+                    random.random(),  # x_center
+                    random.random(),  # y_center
+                    random.random() * 0.5,  # width (max 0.5)
+                    random.random() * 0.5   # height (max 0.5)
+                ]
+                pred_boxes.append(random_box)
+        #--------------------------------------------------------------------------------------------
+        # YOUR MODEL INFERENCE STOPS HERE
+        #--------------------------------------------------------------------------------------------
+    finally:
+        # Stop tracking emissions even when inference raises, so the shared tracker
+        # is not left with an open "inference" task for the next request
+        emissions_data = tracker.stop_task()
+    # Calculate classification accuracy
+    classification_accuracy = accuracy_score(true_labels, predictions)
+    # Calculate mean IoU for object detection (only for images with smoke)
+    # For each image, we compute the max IoU between the predicted box and all true boxes
+    ious = [
+        compute_max_iou(true_boxes, pred_box)
+        for true_boxes, pred_box in zip(true_boxes_list, pred_boxes)
+    ]
+    # No smoke image in the split means there is nothing to average; report 0.0
+    mean_iou = float(np.mean(ious)) if ious else 0.0
+    # Prepare results dictionary (tracker values are multiplied by 1000 for the Wh / gCO2eq fields)
+    results = {
+        "username": username,
+        "space_url": space_url,
+        "submission_timestamp": datetime.now().isoformat(),
+        "model_description": DESCRIPTION,
+        "classification_accuracy": float(classification_accuracy),
+        "mean_iou": mean_iou,
+        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
+        "emissions_gco2eq": emissions_data.emissions * 1000,
+        "emissions_data": clean_emissions_data(emissions_data),
+        "api_route": ROUTE,
+        "dataset_config": {
+            "dataset_name": request.dataset_name,
+            "test_size": request.test_size,
+            "test_seed": request.test_seed
+        }
+    }
+    return results

tasks/text.py ADDED Viewed

	@@ -0,0 +1,92 @@

+from fastapi import APIRouter
+from datetime import datetime
+from datasets import load_dataset
+from sklearn.metrics import accuracy_score
+import random
+from .utils.evaluation import TextEvaluationRequest
+from .utils.emissions import tracker, clean_emissions_data, get_space_info
+# Router exposing the text task; app.py mounts it on the main application
+router = APIRouter()
+# Used as the route description and reported as "model_description" in the results
+DESCRIPTION = "Random Baseline"
+# Path of the evaluation endpoint, echoed back as "api_route" in the results
+ROUTE = "/text"
+@router.post(ROUTE, tags=["Text Task"],
+             description=DESCRIPTION)
+async def evaluate_text(request: TextEvaluationRequest):
+    """
+    Evaluate text classification for climate disinformation detection.
+    Current Model: Random Baseline
+    - Makes random predictions from the label space (0-7)
+    - Used as a baseline for comparison
+    The request supplies the dataset name, the test split size and the split seed.
+    Returns a dict with the accuracy, the energy/emissions figures of the tracked
+    inference pass, and the dataset configuration that was used.
+    """
+    # Get space info
+    username, space_url = get_space_info()
+    # Define the label mapping
+    LABEL_MAPPING = {
+        "0_not_relevant": 0,
+        "1_not_happening": 1,
+        "2_not_human": 2,
+        "3_not_bad": 3,
+        "4_solutions_harmful_unnecessary": 4,
+        "5_science_unreliable": 5,
+        "6_proponents_biased": 6,
+        "7_fossil_fuels_needed": 7
+    }
+    # Load and prepare the dataset
+    dataset = load_dataset(request.dataset_name)
+    # Convert string labels to integers
+    dataset = dataset.map(lambda x: {"label": LABEL_MAPPING[x["label"]]})
+    # Split dataset
+    train_test = dataset["train"].train_test_split(test_size=request.test_size, seed=request.test_seed)
+    test_dataset = train_test["test"]
+    # Start tracking emissions
+    tracker.start()
+    tracker.start_task("inference")
+    try:
+        #--------------------------------------------------------------------------------------------
+        # YOUR MODEL INFERENCE CODE HERE
+        # Update the code below to replace the random baseline by your model inference within the inference pass where the energy consumption and emissions are tracked.
+        #--------------------------------------------------------------------------------------------
+        # Make random predictions (placeholder for actual model inference)
+        true_labels = test_dataset["label"]
+        predictions = [random.randint(0, 7) for _ in true_labels]
+        #--------------------------------------------------------------------------------------------
+        # YOUR MODEL INFERENCE STOPS HERE
+        #--------------------------------------------------------------------------------------------
+    finally:
+        # Stop tracking emissions even when inference raises, so the shared tracker
+        # is not left with an open "inference" task for the next request
+        emissions_data = tracker.stop_task()
+    # Calculate accuracy
+    accuracy = accuracy_score(true_labels, predictions)
+    # Prepare results dictionary (tracker values are multiplied by 1000 for the Wh / gCO2eq fields)
+    results = {
+        "username": username,
+        "space_url": space_url,
+        "submission_timestamp": datetime.now().isoformat(),
+        "model_description": DESCRIPTION,
+        "accuracy": float(accuracy),
+        "energy_consumed_wh": emissions_data.energy_consumed * 1000,
+        "emissions_gco2eq": emissions_data.emissions * 1000,
+        "emissions_data": clean_emissions_data(emissions_data),
+        "api_route": ROUTE,
+        "dataset_config": {
+            "dataset_name": request.dataset_name,
+            "test_size": request.test_size,
+            "test_seed": request.test_seed
+        }
+    }
+    return results

tasks/utils/__init__.py ADDED Viewed

File without changes

tasks/utils/emissions.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from codecarbon import EmissionsTracker
+import os
+# Initialize tracker
+# One module-level instance, imported and reused by the audio, image and text task routes
+tracker = EmissionsTracker(allow_multiple_runs=True)
+class EmissionsData:
+    """Plain holder for an energy figure and an emissions figure.
+    NOTE(review): nothing in the task modules shown here instantiates this class, and
+    the units of the two values are not fixed by this code - confirm before relying on it."""
+    def __init__(self, energy_consumed: float, emissions: float) -> None:
+        # Store both values unchanged as public attributes
+        self.energy_consumed = energy_consumed
+        self.emissions = emissions
+def clean_emissions_data(emissions_data):
+    """Return the instance attributes of emissions_data as a new dict, leaving out the
+    timestamp, project_name, experiment_id, latitude and longitude entries."""
+    excluded = frozenset(('timestamp', 'project_name', 'experiment_id', 'latitude', 'longitude'))
+    cleaned = {}
+    for name, value in emissions_data.__dict__.items():
+        if name in excluded:
+            continue
+        cleaned[name] = value
+    return cleaned
+def get_space_info():
+    """Get the space username and URL from environment variables.
+    Reads SPACE_ID, takes the part before the first "/" as the username and builds
+    the URL as https://huggingface.co/spaces/<SPACE_ID>.
+    Returns a (username, space_url) tuple, or ("local-user", "local-development")
+    when SPACE_ID is unset or empty."""
+    space_name = os.getenv("SPACE_ID", "")
+    if not space_name:
+        return "local-user", "local-development"
+    # Neither the split nor the f-string can raise on a str, so no try/except is needed
+    username = space_name.partition("/")[0]
+    return username, f"https://huggingface.co/spaces/{space_name}"

tasks/utils/evaluation.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from typing import Optional
+from pydantic import BaseModel, Field
+class BaseEvaluationRequest(BaseModel):
+    """Fields shared by every task's evaluation request: how to carve out the test split."""
+    # Bounds are exclusive: the task routes pass this value to train_test_split, which
+    # rejects a float test_size of 0.0 or 1.0, so those values are refused at validation time
+    test_size: float = Field(0.2, gt=0.0, lt=1.0, description="Size of the test split (strictly between 0 and 1)")
+    # Passed as the seed of the train/test split
+    test_seed: int = Field(42, ge=0, description="Random seed for reproducibility")
+class TextEvaluationRequest(BaseEvaluationRequest):
+    """Request body of the text task; adds the dataset to evaluate on."""
+    dataset_name: str = Field(
+        default="QuotaClimat/frugalaichallenge-text-train",
+        description="The name of the dataset on HuggingFace Hub",
+    )
+class ImageEvaluationRequest(BaseEvaluationRequest):
+    """Request body of the image task; adds the dataset to evaluate on."""
+    dataset_name: str = Field(
+        default="pyronear/pyro-sdis",
+        description="The name of the dataset on HuggingFace Hub",
+    )
+class AudioEvaluationRequest(BaseEvaluationRequest):
+    """Request body of the audio task; adds the dataset to evaluate on."""
+    dataset_name: str = Field(
+        default="rfcx/frugalai",
+        description="The name of the dataset on HuggingFace Hub",
+    )