Spaces:

PromptMeister
/

keyword-DNA-analyzer

Running

App Files Files Community

PromptMeister committed on Mar 19

Commit

e68ca61

verified ·

1 Parent(s): b1935a9

Update app.py

Browse files

Files changed (1) hide show

app.py +282 -158

app.py CHANGED Viewed

@@ -2,20 +2,60 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 import torch
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-from transformers import BertTokenizerFast
 import matplotlib.pyplot as plt
 import json
-# Initialize models
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
-pos_model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
-pos_tokenizer = BertTokenizerFast.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
-pos_pipeline = pipeline("token-classification", model=pos_model, tokenizer=pos_tokenizer)
-# Intent classification - using zero-shot classification
-intent_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
 def get_token_colors(token_type):
     colors = {
@@ -41,7 +81,10 @@ def simulate_historical_data(token):
         values = [45, 50, 60, 70, 75, 80]
     else:
         # Standard pattern for common words
-        base = 50 + (hash(token) % 30)
         noise = np.random.normal(0, 5, 6)
         values = [max(5, min(95, base + i*5 + n)) for i, n in enumerate(noise)]
@@ -60,7 +103,7 @@ def generate_origin_data(token):
     ]
     # Deterministic selection based on the token
-    index = hash(token) % len(origins)
     origin = origins[index]
     note = f"First appeared in {origin['era']} texts derived from {origin['language']}."
@@ -105,155 +148,219 @@ def analyze_token_types(tokens):
     return processed_tokens
 def plot_historical_data(historical_data):
-    """Create a plot of historical usage data"""
-    eras = [item[0] for item in historical_data]
-    values = [item[1] for item in historical_data]
-    plt.figure(figsize=(8, 3))
-    plt.bar(eras, values, color='skyblue')
-    plt.title('Historical Usage')
-    plt.xlabel('Era')
-    plt.ylabel('Usage Level')
-    plt.ylim(0, 100)
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-    return plt
-def analyze_keyword(keyword):
-    if not keyword.strip():
-        return None, None, None, None, None
-    # Basic tokenization
-    words = keyword.strip().lower().split()
-    # Get token types
-    token_analysis = analyze_token_types(words)
-    # Get NER tags
-    ner_results = ner_pipeline(keyword)
-    # Get POS tags
-    pos_results = pos_pipeline(keyword)
-    # Process and organize results
-    full_token_analysis = []
-    for token in token_analysis:
-        # Find POS tag for this token
-        pos_tag = "NOUN"  # Default
-        for pos_result in pos_results:
-            if pos_result["word"].lower() == token["text"]:
-                pos_tag = pos_result["entity"]
-                break
-        # Find entity type if any
-        entity_type = None
-        for ner_result in ner_results:
-            if ner_result["word"].lower() == token["text"]:
-                entity_type = ner_result["entity"]
-                break
-        # Generate historical data
-        historical_data = simulate_historical_data(token["text"])
-        # Generate origin data
-        origin = generate_origin_data(token["text"])
-        # Calculate importance (simplified algorithm)
-        importance = 60 + (len(token["text"]) * 2)
-        importance = min(95, importance)
-        # Generate related terms (simplified)
-        related_terms = [f"{token['text']}-related-1", f"{token['text']}-related-2"]
-        full_token_analysis.append({
-            "token": token["text"],
-            "type": token["type"],
-            "posTag": pos_tag,
-            "entityType": entity_type,
-            "importance": importance,
-            "historicalData": historical_data,
-            "origin": origin,
-            "relatedTerms": related_terms
-        })
-    # Intent analysis
-    intent_result = intent_classifier(
-        keyword,
-        candidate_labels=["informational", "navigational", "transactional"]
-    )
-    intent_analysis = {
-        "type": intent_result["labels"][0].capitalize(),
-        "strength": round(intent_result["scores"][0] * 100),
-        "mutations": [
-            f"{intent_result['labels'][0]}-variation-1",
-            f"{intent_result['labels'][0]}-variation-2"
         ]
-    }
-    # Evolution potential (simplified calculation)
-    evolution_potential = min(95, 65 + (len(keyword) % 30))
-    # Predicted trends (simplified)
-    trends = [
-        "Voice search adaptation",
-        "Visual search integration"
-    ]
-    # Evolution chart data (simulated)
-    evolution_data = [
-        {"month": "Jan", "searchVolume": 1000, "competitionScore": 45, "intentClarity": 80},
-        {"month": "Feb", "searchVolume": 1200, "competitionScore": 48, "intentClarity": 82},
-        {"month": "Mar", "searchVolume": 1100, "competitionScore": 52, "intentClarity": 85},
-        {"month": "Apr", "searchVolume": 1400, "competitionScore": 55, "intentClarity": 88},
-        {"month": "May", "searchVolume": 1800, "competitionScore": 58, "intentClarity": 90},
-        {"month": "Jun", "searchVolume": 2200, "competitionScore": 60, "intentClarity": 92}
-    ]
-    # Create plots
-    evolution_chart = create_evolution_chart(evolution_data)
-    # Generate HTML for token visualization
-    token_viz_html = generate_token_visualization_html(token_analysis, full_token_analysis)
-    # Generate HTML for full analysis
-    analysis_html = generate_full_analysis_html(
-        keyword,
-        full_token_analysis,
-        intent_analysis,
-        evolution_potential,
-        trends
-    )
-    # Generate JSON results
-    json_results = {
-        "keyword": keyword,
-        "tokenAnalysis": full_token_analysis,
-        "intentAnalysis": intent_analysis,
-        "evolutionPotential": evolution_potential,
-        "predictedTrends": trends
-    }
-    return token_viz_html, analysis_html, json_results, evolution_chart, full_token_analysis
-def create_evolution_chart(data):
-    """Create an evolution chart from data"""
-    df = pd.DataFrame(data)
-    plt.figure(figsize=(10, 5))
-    plt.plot(df['month'], df['searchVolume'], marker='o', label='Search Volume')
-    plt.plot(df['month'], df['competitionScore']*20, marker='s', label='Competition Score')
-    plt.plot(df['month'], df['intentClarity']*20, marker='^', label='Intent Clarity')
-    plt.title('Predicted Evolution')
-    plt.xlabel('Month')
-    plt.ylabel('Value')
-    plt.legend()
-    plt.grid(True, linestyle='--', alpha=0.7)
-    plt.tight_layout()
-    return plt
 def generate_token_visualization_html(token_analysis, full_analysis):
     """Generate HTML for token visualization"""
@@ -469,6 +576,10 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(label="Enter keyword to analyze", placeholder="e.g. artificial intelligence")
             analyze_btn = gr.Button("Analyze DNA", variant="primary")
             with gr.Row():
@@ -492,9 +603,15 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
     # Set up event handlers
     analyze_btn.click(
         analyze_keyword,
         inputs=[input_text],
-        outputs=[token_viz_html, analysis_html, json_output, evolution_chart, None]
     )
     # Example buttons
@@ -503,11 +620,18 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
             lambda btn_text: btn_text,
             inputs=[btn],
             outputs=[input_text]
         ).then(
             analyze_keyword,
             inputs=[input_text],
-            outputs=[token_viz_html, analysis_html, json_output, evolution_chart, None]
         )
 # Launch the app
-demo.launch()

 import numpy as np
 import pandas as pd
 import torch
 import matplotlib.pyplot as plt
 import json
+import time
+import os
+from functools import partial
+# Global variables to store models.
+# They start out as None and are assigned by load_models() the first time it
+# runs successfully.
+tokenizer = None
+ner_pipeline = None
+pos_pipeline = None
+intent_classifier = None
+# Set to True by load_models() only after every model above has been created.
+models_loaded = False
+def load_models(progress=gr.Progress()):
+    """Lazy-load the tokenizer and the NER, POS and intent pipelines.
+
+    The loaded objects are stored in the module-level globals ``tokenizer``,
+    ``ner_pipeline``, ``pos_pipeline`` and ``intent_classifier``. Once
+    ``models_loaded`` is True, later calls return immediately.
+
+    Args:
+        progress: Callable invoked as ``progress(fraction, desc=...)`` before
+            each loading step.
+
+    Returns:
+        ``True`` when the models are available, otherwise a string of the
+        form ``"Error: <message>"`` built from the exception that was caught.
+    """
+    global tokenizer, ner_pipeline, pos_pipeline, intent_classifier, models_loaded
+    if models_loaded:
+        return True
+    try:
+        progress(0.1, desc="Loading models...")
+        # transformers is imported inside the function, and the models are
+        # created one after another rather than at module import time.
+        from transformers import AutoTokenizer, pipeline
+        progress(0.2, desc="Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        progress(0.4, desc="Loading NER model...")
+        ner_pipeline = pipeline("ner", model="dslim/bert-base-NER")
+        progress(0.6, desc="Loading POS model...")
+        # The POS model and its tokenizer are loaded explicitly and handed to
+        # the token-classification pipeline.
+        from transformers import AutoModelForTokenClassification, BertTokenizerFast
+        pos_model = AutoModelForTokenClassification.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
+        pos_tokenizer = BertTokenizerFast.from_pretrained("vblagoje/bert-english-uncased-finetuned-pos")
+        pos_pipeline = pipeline("token-classification", model=pos_model, tokenizer=pos_tokenizer)
+        progress(0.8, desc="Loading intent classifier...")
+        # Use a smaller model for zero-shot classification
+        intent_classifier = pipeline(
+            "zero-shot-classification",
+            model="typeform/distilbert-base-uncased-mnli",  # Smaller than BART
+            device=0 if torch.cuda.is_available() else -1   # Use GPU if available
+        )
+        progress(1.0, desc="Models loaded successfully!")
+        models_loaded = True
+        return True
+    except Exception as e:
+        # Failures are reported as a string instead of being raised.
+        # models_loaded stays False, so the next call attempts the load again;
+        # globals assigned before the failure keep the values they were given.
+        print(f"Error loading models: {str(e)}")
+        return f"Error: {str(e)}"
 def get_token_colors(token_type):
     colors = {
         values = [45, 50, 60, 70, 75, 80]
     else:
         # Standard pattern for common words
+        # Use the sum of the token's character codes instead of hash(): str hashes are salted per process, so hash() gives different results across runs
+        base = 50 + (sum(ord(c) for c in token) % 30)
+        # Use a fixed seed for reproducibility
+        np.random.seed(sum(ord(c) for c in token))
         noise = np.random.normal(0, 5, 6)
         values = [max(5, min(95, base + i*5 + n)) for i, n in enumerate(noise)]
     ]
     # Deterministic selection based on the token
+    index = sum(ord(c) for c in token) % len(origins)
     origin = origins[index]
     note = f"First appeared in {origin['era']} texts derived from {origin['language']}."
     return processed_tokens
 def plot_historical_data(historical_data):
+    """Create a bar chart of historical usage data, with error handling.
+
+    Args:
+        historical_data: Sequence of items; element 0 of each item is used as
+            the bar label (era) and element 1 as the bar height.
+
+    Returns:
+        The ``matplotlib.pyplot`` module, after a new figure has been drawn
+        with it. If drawing fails, a placeholder figure containing the error
+        text is drawn instead and ``plt`` is still returned.
+    """
+    # NOTE(review): every call opens a new pyplot figure and nothing in this
+    # function closes it - confirm that the caller releases figures.
+    try:
+        eras = [item[0] for item in historical_data]
+        values = [item[1] for item in historical_data]
+        plt.figure(figsize=(8, 3))
+        plt.bar(eras, values, color='skyblue')
+        plt.title('Historical Usage')
+        plt.xlabel('Era')
+        plt.ylabel('Usage Level')
+        # Fixed y-axis range of 0-100.
+        plt.ylim(0, 100)
+        plt.xticks(rotation=45)
+        plt.tight_layout()
+        return plt
+    except Exception as e:
+        print(f"Error in plot_historical_data: {str(e)}")
+        # Return a simple error plot
+        plt.figure(figsize=(8, 3))
+        plt.text(0.5, 0.5, f"Error creating plot: {str(e)}",
+                 horizontalalignment='center', verticalalignment='center')
+        plt.axis('off')
+        return plt
+def create_evolution_chart(data):
+    """Create a line chart of the evolution data, with error handling.
+
+    Args:
+        data: Input passed to ``pd.DataFrame``; the resulting frame must have
+            the columns ``month``, ``searchVolume``, ``competitionScore`` and
+            ``intentClarity``.
+
+    Returns:
+        The ``matplotlib.pyplot`` module, after a new figure has been drawn
+        with it. If drawing fails, a placeholder figure containing the error
+        text is drawn instead and ``plt`` is still returned.
+    """
+    # NOTE(review): every call opens a new pyplot figure and nothing in this
+    # function closes it - confirm that the caller releases figures.
+    try:
+        df = pd.DataFrame(data)
+        plt.figure(figsize=(10, 5))
+        plt.plot(df['month'], df['searchVolume'], marker='o', label='Search Volume')
+        # The two score columns are multiplied by 20 before plotting -
+        # presumably to bring them onto a scale comparable with searchVolume;
+        # TODO confirm.
+        plt.plot(df['month'], df['competitionScore']*20, marker='s', label='Competition Score')
+        plt.plot(df['month'], df['intentClarity']*20, marker='^', label='Intent Clarity')
+        plt.title('Predicted Evolution')
+        plt.xlabel('Month')
+        plt.ylabel('Value')
+        plt.legend()
+        plt.grid(True, linestyle='--', alpha=0.7)
+        plt.tight_layout()
+        return plt
+    except Exception as e:
+        print(f"Error in create_evolution_chart: {str(e)}")
+        # Return a simple error plot
+        plt.figure(figsize=(10, 5))
+        plt.text(0.5, 0.5, f"Error creating chart: {str(e)}",
+                 horizontalalignment='center', verticalalignment='center')
+        plt.axis('off')
+        return plt
+def analyze_keyword(keyword, progress=gr.Progress()):
+    """Main function to analyze a keyword.
+
+    Splits the keyword into lower-cased words, tags them with the NER and POS
+    pipelines, classifies the search intent and renders the results.
+
+    Args:
+        keyword: Text entered by the user. ``None``, empty or whitespace-only
+            input produces a prompt message instead of an analysis.
+        progress: Callable invoked as ``progress(fraction, desc=...)`` to
+            report progress; it is also passed on to ``load_models``.
+
+    Returns:
+        A 4-tuple ``(token_viz_html, analysis_html, json_results,
+        evolution_chart)``. When the input is empty, the models cannot be
+        loaded, or the analysis raises, the first two items are the same HTML
+        message and the last two are ``None``.
+    """
+    # Local import: only the error paths below need it.
+    from html import escape
+    if not keyword or not keyword.strip():
+        return (
+            "<div>Please enter a keyword to analyze</div>",
+            "<div>Please enter a keyword to analyze</div>",
+            None,
+            None
+        )
+    progress(0.1, desc="Starting analysis...")
+    # Load models if not already loaded
+    model_status = load_models(progress)
+    if isinstance(model_status, str) and model_status.startswith("Error"):
+        # The status string is built from exception text, which may contain
+        # characters such as "<" or "&"; escape it before embedding in HTML.
+        safe_status = escape(model_status)
+        return (
+            f"<div style='color:red;'>{safe_status}</div>",
+            f"<div style='color:red;'>{safe_status}</div>",
+            None,
+            None
+        )
+    try:
+        # Basic tokenization - just split on spaces for simplicity
+        words = keyword.strip().lower().split()
+        progress(0.2, desc="Analyzing tokens...")
+        # Get token types
+        token_analysis = analyze_token_types(words)
+        progress(0.3, desc="Running NER...")
+        # Get NER tags - a failure here degrades to "no entities found"
+        try:
+            ner_results = ner_pipeline(keyword)
+        except Exception as e:
+            print(f"NER error: {str(e)}")
+            ner_results = []
+        progress(0.4, desc="Running POS tagging...")
+        # Get POS tags - a failure here leaves every token on the default tag
+        try:
+            pos_results = pos_pipeline(keyword)
+        except Exception as e:
+            print(f"POS error: {str(e)}")
+            pos_results = []
+        # Process and organize results
+        full_token_analysis = []
+        for token in token_analysis:
+            # Find POS tag for this token (first pipeline result whose word
+            # matches the token text wins)
+            pos_tag = "NOUN"  # Default
+            for pos_result in pos_results:
+                if pos_result["word"].lower() == token["text"]:
+                    pos_tag = pos_result["entity"]
+                    break
+            # Find entity type if any
+            entity_type = None
+            for ner_result in ner_results:
+                if ner_result["word"].lower() == token["text"]:
+                    entity_type = ner_result["entity"]
+                    break
+            # Generate historical data
+            historical_data = simulate_historical_data(token["text"])
+            # Generate origin data
+            origin = generate_origin_data(token["text"])
+            # Calculate importance (simplified algorithm): grows with token
+            # length and is capped at 95
+            importance = 60 + (len(token["text"]) * 2)
+            importance = min(95, importance)
+            # Generate related terms (simplified)
+            related_terms = [f"{token['text']}-related-1", f"{token['text']}-related-2"]
+            full_token_analysis.append({
+                "token": token["text"],
+                "type": token["type"],
+                "posTag": pos_tag,
+                "entityType": entity_type,
+                "importance": importance,
+                "historicalData": historical_data,
+                "origin": origin,
+                "relatedTerms": related_terms
+            })
+        progress(0.6, desc="Analyzing intent...")
+        # Intent analysis - a failure here falls back to a fixed default
+        try:
+            intent_result = intent_classifier(
+                keyword,
+                candidate_labels=["informational", "navigational", "transactional"]
+            )
+            # The first label/score pair is taken as the predicted intent.
+            intent_analysis = {
+                "type": intent_result["labels"][0].capitalize(),
+                "strength": round(intent_result["scores"][0] * 100),
+                "mutations": [
+                    f"{intent_result['labels'][0]}-variation-1",
+                    f"{intent_result['labels'][0]}-variation-2"
+                ]
+            }
+        except Exception as e:
+            print(f"Intent classification error: {str(e)}")
+            intent_analysis = {
+                "type": "Informational",  # Default fallback
+                "strength": 70,
+                "mutations": ["fallback-variation-1", "fallback-variation-2"]
+            }
+        # Evolution potential (simplified calculation)
+        evolution_potential = min(95, 65 + (len(keyword) % 30))
+        # Predicted trends (simplified)
+        trends = [
+            "Voice search adaptation",
+            "Visual search integration"
         ]
+        # Evolution chart data (simulated)
+        evolution_data = [
+            {"month": "Jan", "searchVolume": 1000, "competitionScore": 45, "intentClarity": 80},
+            {"month": "Feb", "searchVolume": 1200, "competitionScore": 48, "intentClarity": 82},
+            {"month": "Mar", "searchVolume": 1100, "competitionScore": 52, "intentClarity": 85},
+            {"month": "Apr", "searchVolume": 1400, "competitionScore": 55, "intentClarity": 88},
+            {"month": "May", "searchVolume": 1800, "competitionScore": 58, "intentClarity": 90},
+            {"month": "Jun", "searchVolume": 2200, "competitionScore": 60, "intentClarity": 92}
+        ]
+        progress(0.8, desc="Creating visualizations...")
+        # Create plots
+        evolution_chart = create_evolution_chart(evolution_data)
+        # Generate HTML for token visualization
+        token_viz_html = generate_token_visualization_html(token_analysis, full_token_analysis)
+        # Generate HTML for full analysis
+        analysis_html = generate_full_analysis_html(
+            keyword,
+            full_token_analysis,
+            intent_analysis,
+            evolution_potential,
+            trends
+        )
+        # Generate JSON results
+        json_results = {
+            "keyword": keyword,
+            "tokenAnalysis": full_token_analysis,
+            "intentAnalysis": intent_analysis,
+            "evolutionPotential": evolution_potential,
+            "predictedTrends": trends
+        }
+        progress(1.0, desc="Analysis complete!")
+        return token_viz_html, analysis_html, json_results, evolution_chart
+    except Exception as e:
+        # Escape the exception text so it is shown literally rather than
+        # being interpreted as markup.
+        error_message = f"<div style='color:red;padding:20px;'>Error analyzing keyword: {escape(str(e))}</div>"
+        print(f"Error in analyze_keyword: {str(e)}")
+        return error_message, error_message, None, None
 def generate_token_visualization_html(token_analysis, full_analysis):
     """Generate HTML for token visualization"""
     with gr.Row():
         with gr.Column():
             input_text = gr.Textbox(label="Enter keyword to analyze", placeholder="e.g. artificial intelligence")
+            # Add loading indicator
+            status_html = gr.HTML('<div style="color:gray;text-align:center;">Enter a keyword and click "Analyze DNA"</div>')
             analyze_btn = gr.Button("Analyze DNA", variant="primary")
             with gr.Row():
     # Set up event handlers
     analyze_btn.click(
+        lambda: '<div style="color:blue;text-align:center;">Loading models and analyzing... This may take a moment.</div>',
+        outputs=status_html
+    ).then(
         analyze_keyword,
         inputs=[input_text],
+        outputs=[token_viz_html, analysis_html, json_output, evolution_chart]
+    ).then(
+        lambda: '<div style="color:green;text-align:center;">Analysis complete!</div>',
+        outputs=status_html
     )
     # Example buttons
             lambda btn_text: btn_text,
             inputs=[btn],
             outputs=[input_text]
+        ).then(
+            lambda: '<div style="color:blue;text-align:center;">Loading models and analyzing... This may take a moment.</div>',
+            outputs=status_html
         ).then(
             analyze_keyword,
             inputs=[input_text],
+            outputs=[token_viz_html, analysis_html, json_output, evolution_chart]
+        ).then(
+            lambda: '<div style="color:green;text-align:center;">Analysis complete!</div>',
+            outputs=status_html
         )
 # Launch the app
+if __name__ == "__main__":
+    demo.launch()