"""Pydantic schemas for datasets, impact assessments and dataset combination."""

import logging
from typing import Dict, List, Optional, Any
from datetime import datetime

from pydantic import BaseModel, Field

from app.schemas.dataset_common import ImpactLevel, DatasetMetrics

# NOTE: the models below are described with comments instead of class
# docstrings on purpose: pydantic publishes a model's docstring as the
# "description" of its generated schema, so adding docstrings would change
# the emitted schema.

# Module-level logger.
log = logging.getLogger(__name__)

# Strategy names recognised for dataset combination.
SUPPORTED_STRATEGIES = ["merge", "intersect", "filter"]


# Outcome of an impact assessment for a single dataset: the resulting level,
# the method that produced it, and the metrics/thresholds it was based on.
class ImpactAssessment(BaseModel):
    dataset_id: str = Field(
        ...,
        description="The ID of the dataset being assessed",
    )
    impact_level: ImpactLevel = Field(
        ...,
        description="The impact level: low, medium, or high",
    )
    assessment_method: str = Field(
        default="unknown",
        description="Method used to determine impact level (e.g., size_based, downloads_and_likes_based)",
    )
    metrics: DatasetMetrics = Field(
        ...,
        description="Metrics used for impact assessment",
    )
    thresholds: Dict[str, Dict[str, str]] = Field(
        default={},
        description="Thresholds used for determining impact levels (for reference)",
    )


# Dataset record with optional impact data attached. Only `id` is required;
# further fields can be added as needed.
class DatasetInfo(BaseModel):
    id: str
    impact_level: Optional[ImpactLevel] = None
    impact_assessment: Optional[Dict] = None

    class Config:
        # Accept extra fields coming from the API.
        extra = "allow"


# Fields shared by the dataset create / update / read schemas.
class DatasetBase(BaseModel):
    name: str
    description: Optional[str] = None
    tags: Optional[List[str]] = None


# Payload for creating a dataset: the base fields plus an optional file list.
class DatasetCreate(DatasetBase):
    files: Optional[List[str]] = None


# Payload for updating a dataset: `name` is relaxed to optional so that
# updates do not have to supply it.
class DatasetUpdate(DatasetBase):
    name: Optional[str] = None


# Stored dataset: the base fields plus identity, ownership and timestamps.
class Dataset(DatasetBase):
    id: int  # could be str instead, depending on the ID format in use
    owner_id: str  # assumes user IDs are strings
    created_at: Optional[str] = None
    updated_at: Optional[str] = None

    class Config:
        # Intentionally empty: orm_mode = True was removed since no ORM is used.
        pass


# Request body for combining several source datasets into a new one.
class DatasetCombineRequest(BaseModel):
    source_datasets: List[str] = Field(
        ...,
        description="List of dataset IDs to combine",
    )
    name: str = Field(
        ...,
        description="Name for the combined dataset",
    )
    description: Optional[str] = Field(
        default=None,
        description="Description for the combined dataset",
    )
    combination_strategy: str = Field(
        default="merge",
        description="Strategy to use when combining datasets (e.g., 'merge', 'intersect', 'filter')",
    )
    filter_criteria: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Criteria for filtering when combining datasets",
    )


# A dataset produced by combining source datasets, including its processing
# status and, once available, its impact level, metrics and storage location.
class CombinedDataset(BaseModel):
    id: str = Field(
        ...,
        description="ID of the combined dataset",
    )
    name: str = Field(
        ...,
        description="Name of the combined dataset",
    )
    description: Optional[str] = Field(
        default=None,
        description="Description of the combined dataset",
    )
    source_datasets: List[str] = Field(
        ...,
        description="IDs of the source datasets",
    )
    created_at: datetime = Field(
        ...,
        description="Creation timestamp",
    )
    created_by: str = Field(
        ...,
        description="ID of the user who created this combined dataset",
    )
    impact_level: Optional[ImpactLevel] = Field(
        default=None,
        description="Calculated impact level of the combined dataset",
    )
    status: str = Field(
        default="processing",
        description="Status of the dataset combination process",
    )
    combination_strategy: str = Field(
        ...,
        description="Strategy used when combining datasets",
    )
    metrics: Optional[DatasetMetrics] = Field(
        default=None,
        description="Metrics for the combined dataset",
    )
    storage_bucket_id: Optional[str] = Field(
        default=None,
        description="ID of the storage bucket containing dataset files",
    )
    storage_folder_path: Optional[str] = Field(
        default=None,
        description="Path to the dataset files within the bucket",
    )

    class Config:
        # Accept extra fields for flexibility.
        extra = "allow"


# Public names exported by this module.
__all__ = [
    "ImpactLevel",
    "ImpactAssessment",
    "DatasetInfo",
    "DatasetMetrics",
    "Dataset",
    "DatasetCreate",
    "DatasetUpdate",
    "DatasetCombineRequest",
    "CombinedDataset",
]