ALLOUNE committed
Commit ceaeaf3 · 1 Parent(s): 097e47f
Dockerfile ADDED
@@ -0,0 +1,25 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9
+
+ # Create a non-root user for security
+ RUN useradd -m -u 1000 user
+ USER user
+
+ # Set the working directory in the container
+ ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ # Copy the requirements file into the container
+ COPY --chown=user ./requirements.txt requirements.txt
+
+ # Install any needed packages specified in requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # Copy the rest of the application code into the container
+ COPY --chown=user . .
+
+ # Command to run the application.
+ # Hugging Face Spaces expects the app to listen on port 7860.
+ # The command points to the `app` object inside the `main.py` file at the repo root.
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
+
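To try the image locally, one can build and run it with `docker build -t tech-api .` followed by `docker run -p 7860:7860 tech-api` (the `tech-api` tag is illustrative), then call the endpoints defined in `main.py` below.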
__init__.py ADDED
File without changes
main.py ADDED
@@ -0,0 +1,71 @@
+ # main.py
+ from fastapi import FastAPI, HTTPException
+ import sentence_transformers
+ from huggingface_hub import hf_hub_download
+ import pandas as pd
+
+ from src.processor import send_to_dataset, search_and_retrieve, generate_tech
+ from typing import List, Dict
+ from pydantic import BaseModel
+
+ from datasets import load_dataset
+
+ # This is the main application object that Uvicorn will run
+ app = FastAPI(
+     title="My Standalone API",
+     description="An API hosted on Hugging Face Spaces",
+     version="1.0.0"
+ )
+
+ # Load the embedding model and the technologies dataset once at startup.
+ model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ dataset = load_dataset("heymenn/Technologies", split="train")
+
+
+ class SearchInput(BaseModel):
+     title: str
+
+ class SearchOutput(BaseModel):
+     title: str
+     purpose: str
+     score: float
+     top5: List[Dict]
+
+ class GenerateInput(BaseModel):
+     title: str
+     instructions: str
+     force: bool = False
+
+ class GenerateOutput(BaseModel):
+     name: str
+     purpose: str
+     problem_types_solved: str
+     advantages: str
+     limitations: str
+     domain_tags: str
+
+ @app.post("/search-technologies", response_model=SearchOutput)
+ def post_search(payload: SearchInput):
+     """
+     Endpoint that returns the closest matching technology for a title.
+     """
+     config = {"dataset": dataset, "model": model}
+     res = search_and_retrieve(payload.title, config)
+     return res
+
+ @app.post("/generate-technology", response_model=GenerateOutput)
+ def post_generate_and_push(payload: GenerateInput):
+     """
+     Endpoint to generate a technology and push it to the dataset.
+     """
+
+     config = {"dataset": dataset, "model": model}
+     res = search_and_retrieve(payload.title, config)
+     if res["score"] >= 0.7 and not payload.force:
+         raise HTTPException(status_code=409, detail=f"Cannot generate the technology: a similarity score of {res['score']} was found for the existing technology '{res['title']}'. Set force=true to override.")
+
+     json_response = generate_tech(payload.title, payload.instructions)
+
+     send_to_dataset(json_response, model)
+
+     return json_response
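A minimal client sketch against the running container, assuming it is reachable on `localhost:7860` (the payload values and the `requests` dependency are illustrative, not part of the commit):

```python
import requests

BASE = "http://localhost:7860"  # port set in the Dockerfile CMD

# Find the closest existing technology for a title.
search = requests.post(f"{BASE}/search-technologies",
                       json={"title": "generative watermarking"})
body = search.json()
print(body["title"], body["score"])

# Generate a new entry and push it to the dataset.
# force=True bypasses the duplicate guard triggered at score >= 0.7.
gen = requests.post(f"{BASE}/generate-technology",
                    json={"title": "generative watermarking",
                          "instructions": "focus on image generation models",
                          "force": True})
print(gen.json()["name"])
```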
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi
+ uvicorn[standard]
+ sentence-transformers
+ pandas
+ fuzzywuzzy
+ google-genai
+ datasets
src/__pycache__/processor.cpython-310.pyc ADDED
Binary file (4.28 kB).
src/processor.py ADDED
@@ -0,0 +1,113 @@
+ from fuzzywuzzy import fuzz
+ from google.genai import Client, types
+ import json
+ import os
+ from datasets import load_dataset
+
+
+ def search_and_retrieve(user_input, config):
+     dataset = config["dataset"]
+     model = config["model"]
+
+     user_embedding = model.encode(user_input)
+     results = []
+     max_result = {"score": 0, "technology": "", "type": ""}
+
+     for row in dataset:
+         name = row["name"]
+         purpose = row["purpose"]
+
+         # Blend semantic similarity with a fuzzy string match on the name.
+         cosim = model.similarity(row["embeddings"], user_embedding)
+         token_set_ratio = fuzz.token_set_ratio(user_input, name)
+
+         fuzzy_score = token_set_ratio / 100
+         alpha = 0.6
+         combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
+         result = {"title": name, "purpose": purpose, "score": combined_score.item()}
+         if combined_score > max_result["score"]:
+             max_result = result
+
+         results.append(result)
+
+     # Keep the five next-best matches, skipping the top result itself.
+     top_5 = sorted(results, key=lambda x: x['score'], reverse=True)[1:6]
+     max_result["top5"] = top_5
+
+     return max_result
+
+
+ def generate_tech(user_input, user_instructions):
+     prompt = f"""
+ # ROLE
+
+ You are a meticulous senior technical analyst and technology scout. Your task is to turn a technology into a structured JSON object.
+
+ # OBJECTIVE
+
+ Analyze the provided `<USER_INPUT>`. Identify which technology is discussed and focus on the highest level of that technology.
+ Create a complete JSON object according to the schema below.
+ Your final output must be a single, valid JSON document containing the technology you created.
+ The technology should be described with sentences.
+
+ # INSTRUCTIONS & RULES
+
+ 1. **JSON Output**: Your entire response MUST be a single JSON object.
+    Do not include any explanatory text before or after the JSON.
+ 2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
+ 3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
+    Do not use single keywords.
+ 4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.
+
+ # JSON SCHEMA & EXAMPLE
+
+ Your output must be a JSON object matching this structure. Note how `purpose` and `problem_types_solved` contain full sentences.
+
+ {{"name": "Generative Watermarking",
+ "purpose": "Adds invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source.",
+ "problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes.",
+ "advantages": "Much faster to generate with an AI.",
+ "limitations": "Takes a lot of computational time to generate.",
+ "domain_tags": "Present in the domains of: AI ethics, cybersecurity, digital media, content moderation"
+ }}
+
+ Take the following additional information into account, if any:
+ {user_instructions}
+ ---
+ ***NOW, BEGIN THE TASK.***
+
+ <USER_INPUT>
+ {user_input}
+ </USER_INPUT>
+ """
+
+     # Read the Gemini API key from the environment rather than hardcoding a secret.
+     client = Client(api_key=os.environ["GEMINI_API_KEY"])
+
+     # Define the grounding tool
+     grounding_tool = types.Tool(
+         google_search=types.GoogleSearch()
+     )
+
+     # Configure generation settings
+     config = types.GenerateContentConfig(
+         tools=[grounding_tool]
+     )
+
+     response = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=prompt,
+         config=config,
+     )
+
+     # Extract the JSON object from the model's reply and parse it.
+     raw = response.text
+     json_data = json.loads(raw[raw.find("{"):raw.rfind("}") + 1])
+
+     return json_data
+
+
+ def send_to_dataset(data, model):
+     # Embed the full record so it can be searched later.
+     data_embedding = model.encode(str(data))
+     data["embeddings"] = data_embedding.tolist()
+
+     dataset = load_dataset("heymenn/Technologies", split="train")
+     updated_dataset = dataset.add_item(data)
+     updated_dataset.push_to_hub("heymenn/Technologies")
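
A standalone sketch of the hybrid scoring that `search_and_retrieve` applies per row, assuming `sentence-transformers` (v3+, for `model.similarity`) and `fuzzywuzzy` are installed; the query and name values are illustrative:

```python
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

query = "watermarking for AI-generated images"
name = "Generative Watermarking"

# Semantic similarity between the query and the stored name embedding.
cosim = model.similarity(model.encode(name), model.encode(query)).item()
# Fuzzy token-set match on the raw strings, scaled to [0, 1].
fuzzy = fuzz.token_set_ratio(query, name) / 100
# The same 60/40 blend used in search_and_retrieve (alpha = 0.6).
print(0.6 * cosim + 0.4 * fuzzy)
```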