Spaces:

Zwounds
/

Boolean_Search_Query_Model

Runtime error

App Files Community

Zwounds committed on Mar 18

Commit

39838a2

verified ·

1 Parent(s): 928fd8e

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

README.md +2 -8
demo.py +172 -0

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: Boolean Search Query Model
-emoji: 🐨
-colorFrom: blue
-colorTo: indigo
 sdk: gradio
 sdk_version: 5.21.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Boolean_Search_Query_Model
+app_file: demo.py
 sdk: gradio
 sdk_version: 5.21.0
 ---

demo.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import gradio as gr
+import torch
+from unsloth import FastLanguageModel
+import logging
+# Setup logging: configure the root logger at INFO level and create the
+# module-level logger used by the functions below.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def load_model(
+    model_name="boolean_model_merged",
+    max_seq_length=2048,
+    load_in_4bit=True,
+):
+    """Load the fine-tuned model and its tokenizer, ready for inference.
+
+    Args:
+        model_name: Model path or identifier handed to
+            ``FastLanguageModel.from_pretrained``. Defaults to the
+            "boolean_model_merged" folder this demo was written for.
+        max_seq_length: Value forwarded as ``max_seq_length``.
+        load_in_4bit: Value forwarded as ``load_in_4bit``.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple; the model has already been passed
+        through ``FastLanguageModel.for_inference``.
+    """
+    logger.info("Loading model...")
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name,
+        max_seq_length=max_seq_length,
+        dtype=None,  # Auto-detect
+        load_in_4bit=load_in_4bit,
+    )
+    FastLanguageModel.for_inference(model)
+    return model, tokenizer
+def format_prompt(query):
+    """Wrap *query* in the instruction prompt used for boolean conversion.
+
+    The returned text is the fixed instruction preamble (rules and worked
+    examples), then the query under the "### Input:" header, then an
+    empty "### Response:" section.
+    """
+    # Static part of the prompt: it has no placeholders, so a plain string
+    # literal is enough.
+    preamble = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+### Instruction:
+Convert this natural language query into a boolean search query by following these rules:
+1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
+   - articles, papers, research, studies
+   - examining, investigating, analyzing
+   - findings, documents, literature
+   - publications, journals, reviews
+   Example: "Research examining X" → just "X"
+2. SECOND: Remove generic implied terms that don't add search value:
+   - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
+   - Remove words like "impacts," "effects," "influences," "role," "applications"
+   - For example: "sustainable agriculture practices" → "sustainable agriculture"
+   - For example: "teaching methodologies" → "teaching"
+   - For example: "leadership styles" → "leadership"
+3. THEN: Format the remaining terms:
+   CRITICAL QUOTING RULES:
+   - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
+   - Examples of correct quoting:
+     - Wrong: machine learning AND deep learning
+     - Right: "machine learning" AND "deep learning"
+     - Wrong: natural language processing
+     - Right: "natural language processing"
+   - Single words must NEVER have quotes (e.g., science, research, learning)
+   - Use AND to connect required concepts
+   - Use OR with parentheses for alternatives (e.g., ("soil health" OR biodiversity))
+Example conversions showing proper quoting:
+"Research on machine learning for natural language processing"
+→ "machine learning" AND "natural language processing"
+"Studies examining anxiety depression stress in workplace"
+→ (anxiety OR depression OR stress) AND workplace
+"Articles about deep learning impact on computer vision"
+→ "deep learning" AND "computer vision"
+"Research on sustainable agriculture practices and their impact on soil health or biodiversity"
+→ "sustainable agriculture" AND ("soil health" OR biodiversity)
+"Articles about effective teaching methods for second language acquisition"
+→ teaching AND "second language acquisition"
+### Input:
+"""
+    # Dynamic tail: the caller's query followed by the open response header.
+    return preamble + f"{query}\n### Response:\n"
+def get_boolean_query(query):
+    """Generate a boolean search query from a natural-language request.
+
+    Args:
+        query: Free-text search request typed by the user.
+
+    Returns:
+        The generated boolean query with special tokens and surrounding
+        whitespace removed, or an empty string when ``query`` is empty,
+        ``None`` or only whitespace.
+    """
+    # Nothing to convert: skip the generation call entirely.
+    if not query or not query.strip():
+        return ""
+    prompt = format_prompt(query)
+    # Put the inputs on the device the model actually lives on rather than
+    # guessing from CUDA availability, which can disagree with the model.
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=32,
+        do_sample=False,
+        use_cache=True,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    # The returned sequence starts with the prompt tokens; slice them off so
+    # the result does not depend on finding "### Response:" in decoded text.
+    prompt_length = inputs["input_ids"].shape[1]
+    generated_tokens = outputs[0][prompt_length:]
+    # skip_special_tokens drops every special token instead of only one
+    # hard-coded end-of-text marker.
+    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+# Load model globally: this runs once at import time, and the resulting
+# module-level `model` and `tokenizer` are the names get_boolean_query reads
+# on every request.
+logger.info("Initializing model...")
+model, tokenizer = load_model()
+logger.info("Model loaded successfully")
+# Example queries in natural language. Each one becomes a single-element row
+# (one value for the one input box); the comments name what each exercises.
+examples = [
+    [text]
+    for text in (
+        # Removal of meta-terms
+        "Find research papers examining the long-term effects of meditation on brain structure",
+        # Removal of generic implied terms (practices, techniques, methods)
+        "Articles about deep learning techniques for natural language processing tasks",
+        # Removal of impact/effect terms
+        "Studies on the impact of early childhood nutrition on cognitive development",
+        # Handling of technology applications
+        "Information on virtual reality applications in architectural design and urban planning",
+        # Proper OR relationship with parentheses
+        "Research on electric vehicles adoption in urban environments or rural communities",
+        # Proper quoting of multi-word concepts only
+        "Articles on biodiversity loss in coral reefs and rainforest ecosystems",
+        # Removal of strategy/approach terms
+        "Studies about different teaching approaches for children with learning disabilities",
+        # Complex OR relationships
+        "Research examining social media influence on political polarization or public discourse",
+        # Implied terms in specific industries
+        "Articles about implementation strategies for blockchain in supply chain management or financial services",
+        # Qualifiers that don't add search value
+        "Research on effective leadership styles in multicultural organizations",
+        # Removal of multiple implied terms
+        "Studies on the effects of microplastic pollution techniques on marine ecosystem health",
+        # Domain-specific implied terms
+        "Articles about successful cybersecurity protection methods for critical infrastructure",
+        # Generalized vs specific concepts
+        "Research papers on quantum computing algorithms for cryptography or optimization problems",
+        # Implied terms in outcome descriptions
+        "Studies examining the relationship between sleep quality and academic performance outcomes",
+        # Complex nesting of concepts
+        "Articles about renewable energy integration challenges in developing countries or island nations",
+    )
+]
+# Page metadata for the Gradio interface
+title = "Boolean Search Query Generator"
+description = (
+    "Convert natural language queries into boolean search expressions. "
+    "The model will remove search-related terms (like 'articles', 'research', etc.), "
+    "handle generic implied terms (like 'practices', 'methods'), "
+    "and format the core concepts using proper boolean syntax."
+)
+# One free-text input box and one text output box
+query_box = gr.Textbox(
+    label="Enter your natural language query",
+    placeholder="e.g., I'm looking for information about climate change and renewable energy",
+)
+result_box = gr.Textbox(label="Boolean Search Query")
+# Wire get_boolean_query to the two boxes and attach the example queries
+demo = gr.Interface(
+    fn=get_boolean_query,
+    inputs=[query_box],
+    outputs=result_box,
+    title=title,
+    description=description,
+    examples=examples,
+    theme=gr.themes.Soft(),
+)
+# Start the app only when this file is run as a script
+if __name__ == "__main__":
+    demo.launch()