Update app.py
app.py CHANGED
@@ -2,7 +2,10 @@ import io
 import os
 import re
 import time
-from typing import Any, Dict, List, Optional
+import requests
+from typing import Any, Dict, List, Optional, Set, Union
+from difflib import get_close_matches
+from pathlib import Path
 from itertools import islice
 from functools import partial
 from multiprocessing.pool import ThreadPool
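The new difflib import powers the fuzzy matching used by the refinement code added below. A minimal sketch of how get_close_matches behaves at the 0.8 cutoff this commit uses; the sample values are illustrative, not from app.py:

from difflib import get_close_matches

known_materials = ["Metal", "Wood", "Plastic", "Aluminum"]

# Best match above the similarity cutoff, or an empty list if nothing is close enough.
print(get_close_matches("Aluminium", known_materials, n=1, cutoff=0.8))  # ['Aluminum']
print(get_close_matches("Granite", known_materials, n=1, cutoff=0.8))    # []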
@@ -209,6 +212,28 @@ with gr.Blocks(css=css) as demo:
     gr.Markdown("Save datasets to your account")
     gr.LoginButton()
     select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Select user or organization", visible=False)
+
+    gr.Markdown("Dataset Refinement Settings")
+    refinement_mode = gr.Radio(
+        ["sourceless", "sourced"],
+        value="sourceless",
+        label="Refinement Mode",
+        info="Choose between AI-only refinement or source-based refinement"
+    )
+
+    with gr.Group(visible=False) as source_group:
+        source_type = gr.Dropdown(
+            choices=["csv_url", "xlsx_url", "local_csv", "local_xlsx"],
+            value="csv_url",
+            label="Source Type"
+        )
+        source_path = gr.Textbox(
+            label="Source Path/URL",
+            placeholder="Enter URL or local file path"
+        )
+        load_source_button = gr.Button("Load Source")
+        source_status = gr.Markdown("")
+
     gr.Markdown("Save datasets as public or private datasets")
     visibility_radio = gr.Radio(["public", "private"], value="public", container=False, interactive=False)
     with gr.Column(visible=False) as dataset_page:
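These components are wired up in the event-handler hunk further down; as a standalone sketch, the show/hide pattern used for source_group looks like this (component names here are hypothetical):

import gradio as gr

with gr.Blocks() as sketch:
    mode = gr.Radio(["sourceless", "sourced"], value="sourceless")
    with gr.Group(visible=False) as grp:
        gr.Textbox(label="Source Path/URL")
    # Returning a component with updated properties applies it as an update.
    mode.change(lambda m: gr.Group(visible=(m == "sourced")), inputs=mode, outputs=grp)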
@@ -411,35 +436,193 @@ with gr.Blocks(css=css) as demo:
     ]


-
+    # Knowledge base storage
+    class KnowledgeBase:
+        def __init__(self):
+            self.materials: Set[str] = {'Metal', 'Wood', 'Plastic', 'Aluminum', 'Bronze', 'Steel', 'Glass', 'Leather', 'Fabric'}
+            self.colors: Set[str] = {'Red', 'Black', 'White', 'Silver', 'Bronze', 'Yellow', 'Blue', 'Green', 'Gray', 'Brown'}
+            self.patterns: Dict[str, List[str]] = {}
+            self.source_data: Dict[str, Any] = {}
+
+        def load_source(self, source_type: str, source_path: str) -> None:
+            """Load data from various sources into the knowledge base"""
+            try:
+                if source_type == 'csv_url':
+                    response = requests.get(source_path)
+                    df = pd.read_csv(io.StringIO(response.text))
+                elif source_type == 'xlsx_url':
+                    response = requests.get(source_path)
+                    df = pd.read_excel(io.BytesIO(response.content))
+                elif source_type == 'local_csv':
+                    df = pd.read_csv(source_path)
+                elif source_type == 'local_xlsx':
+                    df = pd.read_excel(source_path)
+                else:
+                    raise ValueError(f"Unsupported source type: {source_type}")
+
+                # Extract patterns and common values
+                self._extract_knowledge(df)
+
+                # Store source data
+                self.source_data[source_path] = df.to_dict('records')
+
+            except Exception as e:
+                print(f"Error loading source {source_path}: {str(e)}")
+
+        def _extract_knowledge(self, df: pd.DataFrame) -> None:
+            """Extract patterns and common values from dataframe"""
+            for column in df.columns:
+                if 'material' in column.lower():
+                    values = df[column].dropna().unique()
+                    self.materials.update(v.title() for v in values if isinstance(v, str))
+                elif 'color' in column.lower():
+                    values = df[column].dropna().unique()
+                    self.colors.update(v.title() for v in values if isinstance(v, str))
+
+                # Store column patterns
+                if df[column].dtype == 'object':
+                    patterns = df[column].dropna().astype(str).tolist()
+                    self.patterns[column] = patterns
+
+        def get_closest_match(self, value: str, field_type: str) -> Optional[str]:
+            """Find closest match from known values"""
+            if field_type == 'material':
+                matches = get_close_matches(value.title(), list(self.materials), n=1, cutoff=0.8)
+            elif field_type == 'color':
+                matches = get_close_matches(value.title(), list(self.colors), n=1, cutoff=0.8)
+            else:
+                return None
+            return matches[0] if matches else None
+
+    # Initialize knowledge base
+    knowledge_base = KnowledgeBase()
+
+    def refine_data_generic(dataset: List[Dict[str, Any]], mode: str = 'sourceless', kb: Optional[KnowledgeBase] = None) -> List[Dict[str, Any]]:
         """
-
-
+        Enhanced universal dataset refinement with source-aware and sourceless modes.
+        Args:
+            dataset: List of dictionary records
+            mode: 'sourceless' or 'sourced'
+            kb: Optional reference data for sourced mode
         """
-        def normalize_value(value):
-
-
-
-
-
-
+        def split_compound_field(field: str) -> List[str]:
+            """Split compound fields like materialwear into separate values"""
+            parts = re.split(r'[,;\n]+', field)
+            parts = [part.strip().title() for part in parts if part.strip()]
+            return list(set(parts))  # Remove duplicates
+
+        def normalize_value(value: Any, field_name: str) -> Any:
+            """Smart value normalization with field context"""
+            if not isinstance(value, str):
+                return value
+
+            # Basic cleanup
+            value = re.sub(r'\s+', ' ', value.strip())
+            value = value.replace('_', ' ')
+
+            # Field-specific processing with knowledge base
+            if any(term in field_name.lower() for term in ['material']):
+                parts = split_compound_field(value)
+                if mode == 'sourced' and kb:
+                    known = [kb.get_closest_match(p, 'material') or p.title() for p in parts]
+                else:
+                    known = [m for m in parts if m in kb.materials] if kb else parts
+                if known:
+                    return known[0] if len(known) == 1 else known
+                return value.title()
+
+            if any(term in field_name.lower() for term in ['color']):
+                parts = split_compound_field(value)
+                if mode == 'sourced' and kb:
+                    known = [kb.get_closest_match(p, 'color') or p.title() for p in parts]
+                else:
+                    known = [c for c in parts if c in kb.colors] if kb else parts
+                if known:
+                    return known[0] if len(known) == 1 else known
+                return value.title()
+
+            if any(term in field_name.lower() for term in ['date', 'time']):
+                # Add date normalization logic here
+                return value
+
+            # Default titlecase for descriptive fields
+            if any(term in field_name.lower() for term in ['type', 'status', 'category', 'description']):
+                return value.title()
+
             return value

-        def clean_record(record):
+        def clean_record(record: Dict[str, Any]) -> Dict[str, Any]:
+            """Enhanced record cleaning with compound field detection"""
             cleaned = {}
+            compound_fields = {}
+
+            # First pass: Basic cleaning and compound field detection
             for key, value in record.items():
-                # Normalize key and value
                 clean_key = key.strip().lower().replace(" ", "_")
+
+                # Handle compound fields (e.g., materialwear)
+                if isinstance(value, str):
+                    for material in COMMON_MATERIALS:
+                        if material.lower() in value.lower():
+                            compound_fields[clean_key] = value
+                            break
+
                 if isinstance(value, list):
-                    cleaned[clean_key] = [normalize_value(v) for v in value]
+                    cleaned[clean_key] = [normalize_value(v, clean_key) for v in value]
                 elif isinstance(value, dict):
                     cleaned[clean_key] = clean_record(value)
                 else:
-                    cleaned[clean_key] = normalize_value(value)
+                    cleaned[clean_key] = normalize_value(value, clean_key)
+
+            # Second pass: Split compound fields
+            for key, value in compound_fields.items():
+                parts = split_compound_field(value)
+                materials = [p for p in parts if p in COMMON_MATERIALS]
+                if materials:
+                    cleaned['material'] = materials[0] if len(materials) == 1 else materials
+                    # Store remaining info in wear/condition field
+                    remaining = [p for p in parts if p not in materials]
+                    if remaining:
+                        cleaned['condition'] = ' '.join(remaining)
+
             return cleaned

+        # Use knowledge base patterns in sourced mode
+        if mode == 'sourced' and kb and kb.patterns:
+            for record in dataset:
+                for field, patterns in kb.patterns.items():
+                    if field in record:
+                        value = str(record[field])
+                        matches = get_close_matches(value, patterns, n=1, cutoff=0.8)
+                        if matches:
+                            record[field] = matches[0]
+
+
         return [clean_record(entry) for entry in dataset]

+    def refine_preview_data(df: pd.DataFrame, mode: str = 'sourceless') -> pd.DataFrame:
+        """Refine preview data with the selected mode"""
+        # Remove dummy "id" columns first
+        for column_name, values in df.to_dict(orient="series").items():
+            try:
+                if [int(v) for v in values] == list(range(len(df))):
+                    df = df.drop(columns=column_name)
+                if [int(v) for v in values] == list(range(1, len(df) + 1)):
+                    df = df.drop(columns=column_name)
+            except Exception:
+                pass
+
+        # Convert to records for refinement
+        records = df.to_dict('records')
+
+        # Apply refinement with current mode and knowledge base
+        refined_records = refine_data_generic(records, mode=mode, kb=knowledge_base)
+
+        # Convert back to DataFrame
+        refined_df = pd.DataFrame(refined_records)
+
+        return refined_df
+
     def detect_anomalies(record: Dict[str, Any]) -> List[str]:
         """
         Detect potential anomalies in a record.
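Illustrative only: how the new helpers compose outside the UI. The URL is hypothetical, and clean_record also consults a module-level COMMON_MATERIALS constant that is not shown in this diff, so exact output depends on its contents:

kb = KnowledgeBase()
kb.load_source("csv_url", "https://example.com/reference.csv")  # hypothetical source

records = [{"Item Color": "siLver", "Material": "alumnium"}]
refined = refine_data_generic(records, mode="sourced", kb=kb)
# Keys are snake_cased and values snapped to the known vocabulary, e.g.
# {"item_color": "Silver", "material": "Aluminum"}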
@@ -621,20 +804,16 @@ with gr.Blocks(css=css) as demo:
     return gr.Column(visible=True), gr.Column(visible=False)


-    @generate_full_dataset_button.click(
-    def generate_full_dataset(title, content, search_query, namespace, visibility):
+    @generate_full_dataset_button.click(
+        inputs=[dataset_title, dataset_content, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode],
+        outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button]
+    )
+    def generate_full_dataset(title, content, search_query, namespace, visibility, mode):
         dataset_name, tags = title.strip("# ").split("\ntags:", 1)
         dataset_name, tags = dataset_name.strip(), tags.strip()
         csv_header, preview_df = parse_preview_df(content)
-        #
-        for column_name, values in preview_df.to_dict(orient="series").items():
-            try:
-                if [int(v) for v in values] == list(range(len(preview_df))):
-                    preview_df = preview_df.drop(columns=column_name)
-                if [int(v) for v in values] == list(range(1, len(preview_df) + 1)):
-                    preview_df = preview_df.drop(columns=column_name)
-            except Exception:
-                pass
+        # Clean and refine the preview data
+        preview_df = refine_preview_data(preview_df, mode)
         columns = list(preview_df)
         output: list[Optional[dict]] = [None] * NUM_ROWS
         output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
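The refine_preview_data call replaces the inline cleanup removed above. The dummy-column test it inherits is simple: a column whose values equal 0..n-1 (or 1..n) merely mirrors the row position, so it is dropped. A small sanity check with made-up data:

import pandas as pd

df = pd.DataFrame({"id": [0, 1, 2], "name": ["a", "b", "c"]})
assert [int(v) for v in df["id"]] == list(range(len(df)))        # would be dropped
assert [int(v) for v in df["id"]] != list(range(1, len(df) + 1))  # 1-based variant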
@@ -688,7 +867,19 @@ with gr.Blocks(css=css) as demo:
         visible=True,
     )

-    @demo.load(outputs=show_dataset_outputs + button_groups + buttons + [generated_texts_state] + [select_namespace_dropdown, visibility_radio])
+    @refinement_mode.change(inputs=refinement_mode, outputs=[source_group])
+    def toggle_source_group(mode):
+        return gr.Group(visible=(mode == "sourced"))
+
+    @load_source_button.click(inputs=[source_type, source_path], outputs=[source_status])
+    def load_knowledge_source(source_type, source_path):
+        try:
+            knowledge_base.load_source(source_type, source_path)
+            return gr.Markdown("✅ Source loaded successfully", visible=True)
+        except Exception as e:
+            return gr.Markdown(f"❌ Error loading source: {str(e)}", visible=True)
+
+    @demo.load(outputs=show_dataset_outputs + button_groups + buttons + [generated_texts_state] + [select_namespace_dropdown, visibility_radio, source_group])
     def load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
         if oauth_token:
             user_info = whoami(oauth_token.token)
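The source-loading path can also be exercised without the UI. Note that KnowledgeBase.load_source catches its own exceptions and prints them rather than raising, so the handler's ❌ branch is largely defensive. A hypothetical call:

knowledge_base.load_source("local_csv", "reference.csv")  # hypothetical local file
print(len(knowledge_base.materials), "known materials")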