File size: 2,522 Bytes
4cbe4e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from typing import List, Dict, Any, Optional
import logging
from elasticsearch import exceptions

from elastic.es_client import get_es_client

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Client object obtained once, at import time, from get_es_client(). It is
# captured as the default value of the `es_client` parameter of
# search_certification_chunks, so rebinding this name later does not change
# that default.
es_client = get_es_client()


def search_certification_chunks(
    index_name: str,
    text_query: str,
    vector_query: List[float],
    certification_name: str,
    es_client=es_client,
    vector_field: str = "embedding",
    text_field: str = "chunk",
    size: int = 5,
    min_score: float = 0.1,  # Lowered threshold
    boost_text: float = 1.0,
    boost_vector: float = 1.0,
) -> List[Dict[str, Any]]:
    """Run a hybrid (lexical + vector) search restricted to one certification.

    A cheap existence check is issued first so that an unknown certification
    is reported in the log and short-circuits to an empty result. The main
    query then combines a ``match`` clause on ``text_field`` with a
    ``script_score`` cosine-similarity clause on ``vector_field``; both live
    in a ``bool.should`` and are restricted by a ``term`` filter on the
    ``certification`` field.

    Args:
        index_name: Index to search.
        text_query: Free-text query for the lexical ``match`` clause.
        vector_query: Query embedding passed to ``cosineSimilarity``.
        certification_name: Exact value of the ``certification`` field that
            returned documents must carry.
        es_client: Client object exposing ``search(index=..., body=...)``.
        vector_field: Name of the vector field used for cosine similarity.
        text_field: Name of the text field used for the ``match`` clause and
            returned as ``"text"`` in each result.
        size: Maximum number of hits to return.
        min_score: Minimum combined score a hit needs to be returned. Pass
            ``None`` to disable the threshold.
        boost_text: Boost applied to the lexical clause.
        boost_vector: Boost applied to the vector clause.

    Returns:
        A list of dicts with the keys ``"id"``, ``"score"``, ``"text"`` and
        ``"source_file"``, in the order Elasticsearch returned the hits.
        ``"text"`` / ``"source_file"`` are ``None`` when the stored document
        lacks the corresponding field. Empty when the certification is
        unknown or nothing matches.
    """
    certification_filter = {"term": {"certification": certification_name}}

    # First verify the certification value exists
    cert_check = es_client.search(
        index=index_name,
        body={
            "query": certification_filter,
            "size": 1,
        },
    )

    if not cert_check["hits"]["hits"]:
        logger.error(f"No documents found with certification: {certification_name}")
        return []

    # Then proceed with hybrid search
    query_body: Dict[str, Any] = {
        "size": size,
        "query": {
            "bool": {
                # Filter context: restricts hits to the requested
                # certification without contributing to the score.
                "filter": [certification_filter],
                "should": [
                    {
                        "match": {
                            text_field: {
                                "query": text_query,
                                "boost": boost_text,
                            }
                        }
                    },
                    {
                        "script_score": {
                            "query": {"match_all": {}},
                            "script": {
                                # "+ 1.0" shifts cosine similarity from
                                # [-1, 1] to [0, 2]; script_score must not
                                # produce negative scores.
                                "source": (
                                    "cosineSimilarity(params.query_vector, "
                                    f"'{vector_field}') + 1.0"
                                ),
                                "params": {"query_vector": vector_query},
                            },
                            "boost": boost_vector,
                        }
                    },
                ],
            }
        },
    }
    if min_score is not None:
        query_body["min_score"] = min_score
    logger.debug(f"Elasticsearch query body: {query_body}")

    logger.info(f"Executing search on index '{index_name}'")
    # No custom routing: routing on an arbitrary document id would confine
    # the search to that document's shard and silently drop other matches.
    response = es_client.search(index=index_name, body=query_body)
    hits = response.get("hits", {}).get("hits", [])
    logger.info(f"Found {len(hits)} matching documents")

    # Process results with correct field names
    results = []
    for hit in hits:
        source = hit.get("_source") or {}
        results.append(
            {
                "id": hit["_id"],
                "score": hit["_score"],
                "text": source.get(text_field),
                "source_file": source.get("source_file"),
            }
        )

    if results:
        logger.debug(f"Top result score: {results[0]['score']}")
        logger.debug(f"Top result source: {results[0]['source_file']}")
    else:
        logger.warning("No results returned from Elasticsearch")

    return results