bwingenroth committed on
Commit
2c69943
·
verified ·
1 Parent(s): 68581ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +148 -95
app.py CHANGED
@@ -5,15 +5,11 @@ from typing import Iterator, Union, Any
5
  import fasttext
6
  import gradio as gr
7
  from dotenv import load_dotenv
8
- from httpx import Client, Timeout
9
  from huggingface_hub import hf_hub_download
10
  from huggingface_hub.utils import logging
11
  from toolz import concat, groupby, valmap
12
- from fastapi import FastAPI
13
- from httpx import AsyncClient
14
  from pathlib import Path
15
 
16
- app = FastAPI()
17
  logger = logging.get_logger(__name__)
18
  load_dotenv()
19
 
@@ -23,7 +19,6 @@ def load_model(repo_id: str) -> fasttext.FastText._FastText:
23
  model_path = hf_hub_download(repo_id, filename="model.bin")
24
  return fasttext.load_model(model_path)
25
 
26
-
27
  def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterator[str]:
28
  for row in rows:
29
  if isinstance(row, str):
@@ -42,10 +37,9 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
42
  except TypeError:
43
  continue
44
 
45
-
46
  FASTTEXT_PREFIX_LENGTH = 9 # fasttext labels are formatted like "__label__eng_Latn"
47
 
48
- # model = load_model(DEFAULT_FAST_TEXT_MODEL)
49
  Path("code/models").mkdir(parents=True, exist_ok=True)
50
  model = fasttext.load_model(
51
  hf_hub_download(
@@ -57,7 +51,6 @@ model = fasttext.load_model(
57
  )
58
  )
59
 
60
-
61
  def model_predict(inputs: str, k=1) -> list[dict[str, float]]:
62
  predictions = model.predict(inputs, k=k)
63
  return [
@@ -65,103 +58,163 @@ def model_predict(inputs: str, k=1) -> list[dict[str, float]]:
65
  for label, prob in zip(predictions[0], predictions[1])
66
  ]
67
 
68
-
69
  def get_label(x):
70
  return x.get("label")
71
 
72
-
73
  def get_mean_score(preds):
74
  return mean([pred.get("score") for pred in preds])
75
 
76
-
77
  def filter_by_frequency(counts_dict: dict, threshold_percent: float = 0.2):
78
  """Filter a dict to include items whose value is above `threshold_percent`"""
79
  total = sum(counts_dict.values())
80
  threshold = total * threshold_percent
81
  return {k for k, v in counts_dict.items() if v >= threshold}
82
 
83
-
84
- def predict_rows(rows, target_column, language_threshold_percent=0.2):
85
- rows = (row.get(target_column) for row in rows)
86
- rows = (row for row in rows if row is not None)
87
- rows = list(yield_clean_rows(rows))
88
- predictions = [model_predict(row) for row in rows]
89
- predictions = [pred for pred in predictions if pred is not None]
90
- predictions = list(concat(predictions))
91
- predictions_by_lang = groupby(get_label, predictions)
92
- langues_counts = valmap(len, predictions_by_lang)
93
- keys_to_keep = filter_by_frequency(
94
- langues_counts, threshold_percent=language_threshold_percent
95
- )
96
- filtered_dict = {k: v for k, v in predictions_by_lang.items() if k in keys_to_keep}
97
- return {
98
- "predictions": dict(valmap(get_mean_score, filtered_dict)),
99
- "pred": predictions,
100
- }
101
-
102
-
103
- @app.get("/items/{hub_id}")
104
- async def predict_language(
105
- hub_id: str,
106
- config: str | None = None,
107
- split: str | None = None,
108
- max_request_calls: int = 10,
109
- number_of_rows: int = 1000,
110
- ) -> dict[Any, Any]:
111
- is_valid = datasets_server_valid_rows(hub_id)
112
- if not is_valid:
113
- gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
114
- if not config:
115
- config, split = await get_first_config_and_split_name(hub_id)
116
- info = await get_dataset_info(hub_id, config)
117
- if info is None:
118
- gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
119
- if dataset_info := info.get("dataset_info"):
120
- total_rows_for_split = dataset_info.get("splits").get(split).get("num_examples")
121
- features = dataset_info.get("features")
122
- column_names = set(features.keys())
123
- logger.info(f"Column names: {column_names}")
124
- if not set(column_names).intersection(TARGET_COLUMN_NAMES):
125
- raise gr.Error(
126
- f"Dataset {hub_id} {column_names} is not in any of the target columns {TARGET_COLUMN_NAMES}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  )
128
- for column in TARGET_COLUMN_NAMES:
129
- if column in column_names:
130
- target_column = column
131
- logger.info(f"Using column {target_column} for language detection")
132
- break
133
- random_rows = await get_random_rows(
134
- hub_id,
135
- total_rows_for_split,
136
- number_of_rows,
137
- max_request_calls,
138
- config,
139
- split,
140
- )
141
- logger.info(f"Predicting language for {len(random_rows)} rows")
142
- predictions = predict_rows(random_rows, target_column)
143
- predictions["hub_id"] = hub_id
144
- predictions["config"] = config
145
- predictions["split"] = split
146
- return predictions
147
-
148
-
149
- @app.get("/")
150
- def main():
151
- app_title = "Language Detection"
152
- inputs = [
153
- gr.Textbox(
154
- None,
155
- label="enter content",
156
- ),
157
- gr.Textbox(None, label="split"),
158
- ]
159
- interface = gr.Interface(
160
- predict_language,
161
- inputs=inputs,
162
- outputs="json",
163
- title=app_title,
164
- # article=app_description,
165
- )
166
- interface.queue()
167
- interface.launch()
 
5
  import fasttext
6
  import gradio as gr
7
  from dotenv import load_dotenv
 
8
  from huggingface_hub import hf_hub_download
9
  from huggingface_hub.utils import logging
10
  from toolz import concat, groupby, valmap
 
 
11
  from pathlib import Path
12
 
 
13
  logger = logging.get_logger(__name__)
14
  load_dotenv()
15
 
 
19
  model_path = hf_hub_download(repo_id, filename="model.bin")
20
  return fasttext.load_model(model_path)
21
 
 
22
  def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterator[str]:
23
  for row in rows:
24
  if isinstance(row, str):
 
37
  except TypeError:
38
  continue
39
 
 
40
  FASTTEXT_PREFIX_LENGTH = 9 # fasttext labels are formatted like "__label__eng_Latn"
41
 
42
+ # Load the model
43
  Path("code/models").mkdir(parents=True, exist_ok=True)
44
  model = fasttext.load_model(
45
  hf_hub_download(
 
51
  )
52
  )
53
 
 
54
  def model_predict(inputs: str, k=1) -> list[dict[str, float]]:
55
  predictions = model.predict(inputs, k=k)
56
  return [
 
58
  for label, prob in zip(predictions[0], predictions[1])
59
  ]
60
 
 
61
def get_label(x):
    """Return the "label" entry of a prediction dict (None when missing)."""
    label = x.get("label")
    return label
63
 
 
64
def get_mean_score(preds):
    """Average the "score" values over a list of prediction dicts."""
    scores = [entry.get("score") for entry in preds]
    return mean(scores)
66
 
 
67
def filter_by_frequency(counts_dict: dict, threshold_percent: float = 0.2) -> set:
    """Return the set of keys whose count is at least `threshold_percent` of the total.

    The comparison is inclusive (`>=`), so keys sitting exactly on the
    threshold are kept.  An empty dict yields an empty set (total is 0).
    The original docstring said "above", which contradicted the code.
    """
    total = sum(counts_dict.values())
    threshold = total * threshold_percent
    return {k for k, v in counts_dict.items() if v >= threshold}
72
 
73
def simple_predict(text, num_predictions=3):
    """Simple language detection function for Gradio interface"""
    if not text or not text.strip():
        return "Please enter some text for language detection."

    try:
        lines = list(yield_clean_rows([text]))
        if not lines:
            return "No valid text found after cleaning."

        # Flatten the per-line top-k predictions into one list.
        per_line = [model_predict(line, k=num_predictions) for line in lines]
        flattened = [pred for preds in per_line for pred in preds]

        if not flattened:
            return "No predictions could be made."

        # Bucket predictions by language label, then summarise each bucket.
        by_language = groupby(get_label, flattened)
        counts = valmap(len, by_language)
        avg_scores = valmap(get_mean_score, by_language)

        return {
            "detected_languages": dict(avg_scores),
            "language_counts": dict(counts),
            "total_predictions": len(flattened),
            "text_lines_analyzed": len(lines),
        }

    except Exception as e:
        return f"Error during prediction: {str(e)}"
112
+
113
def batch_predict(text, threshold_percent=0.2):
    """More advanced prediction with filtering"""
    if not text or not text.strip():
        return "Please enter some text for language detection."

    try:
        rows = list(yield_clean_rows([text]))
        if not rows:
            return "No valid text found after cleaning."

        # One top-1 prediction list per cleaned line, flattened via toolz.concat.
        per_row = [model_predict(row) for row in rows]
        per_row = [p for p in per_row if p is not None]
        flat_preds = list(concat(per_row))

        if not flat_preds:
            return "No predictions could be made."

        # Keep only languages whose frequency clears the threshold.
        grouped = groupby(get_label, flat_preds)
        counts = valmap(len, grouped)
        frequent = filter_by_frequency(counts, threshold_percent=threshold_percent)
        kept = {lang: preds for lang, preds in grouped.items() if lang in frequent}

        return {
            "predictions": dict(valmap(get_mean_score, kept)),
            "all_language_counts": dict(counts),
            "filtered_languages": list(frequent),
            "threshold_used": threshold_percent,
        }

    except Exception as e:
        return f"Error during prediction: {str(e)}"
149
+
150
def build_demo_interface():
    """Assemble the two-tab Gradio Blocks UI and return the (unlaunched) demo."""
    app_title = "Language Detection Tool"
    with gr.Blocks(title=app_title) as demo:
        gr.Markdown(f"# {app_title}")
        gr.Markdown("Enter text below to detect the language(s) it contains.")

        # Tab 1: per-line top-k detection backed by simple_predict.
        with gr.Tab("Simple Detection"):
            with gr.Row():
                with gr.Column():
                    text_input1 = gr.Textbox(
                        label="Enter text for language detection",
                        placeholder="Type or paste your text here...",
                        lines=5
                    )
                    num_predictions = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=3,
                        step=1,
                        label="Number of top predictions per line"
                    )
                    predict_btn1 = gr.Button("Detect Language")

                with gr.Column():
                    output1 = gr.JSON(label="Detection Results")

            # Button click feeds the textbox + slider values into simple_predict;
            # the returned dict (or error string) is rendered by the JSON widget.
            predict_btn1.click(
                simple_predict,
                inputs=[text_input1, num_predictions],
                outputs=output1
            )

        # Tab 2: frequency-filtered detection backed by batch_predict.
        with gr.Tab("Advanced Detection"):
            with gr.Row():
                with gr.Column():
                    text_input2 = gr.Textbox(
                        label="Enter text for advanced language detection",
                        placeholder="Type or paste your text here...",
                        lines=5
                    )
                    threshold = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.2,
                        step=0.1,
                        label="Threshold percentage for filtering"
                    )
                    predict_btn2 = gr.Button("Advanced Detect")

                with gr.Column():
                    output2 = gr.JSON(label="Advanced Detection Results")

            predict_btn2.click(
                batch_predict,
                inputs=[text_input2, threshold],
                outputs=output2
            )

        gr.Markdown("### About")
        gr.Markdown("This tool uses Facebook's FastText language identification model to detect languages in text.")

    return demo
212
+
213
+
214
if __name__ == "__main__":
    # Build the Gradio UI and serve it on all interfaces at port 7860,
    # without creating a public share link.
    ui = build_demo_interface()
    ui.launch(server_name="0.0.0.0", server_port=7860, share=False)