"""
Stores the detailed metadata for the Solr index fields.

This information is crucial for the LLM to understand the data schema,
enabling it to construct accurate and efficient Solr queries. Separating it
into its own module keeps the main application logic cleaner.
"""

# Schema description of the Solr index, one dict per field. Every entry
# carries the same four keys:
#   field_name     - name of the field in the index
#   type           - free-text description of the field's type and role
#   example_values - list of sample values (strings or numbers)
#   definition     - guidance text explaining the field and when to use it
field_metadata = [
    {
        "field_name": "business_model",
        "type": "string (categorical)",
        "example_values": ["pharma/bio", "drug delivery", "pharma services"],
        "definition": "The primary business category of the company involved in the news. Use for filtering by high-level industry segments.",
    },
    {
        "field_name": "news_type",
        "type": "string (categorical)",
        "example_values": ["product news", "financial news", "regulatory news"],
        "definition": "The category of the news article itself (e.g., financial, regulatory, acquisition). Use for filtering by the type of event being reported.",
    },
    {
        "field_name": "event_type",
        "type": "string (categorical)",
        "example_values": ["phase 2", "phase 1", "pre clinical", "marketed"],
        "definition": "The clinical or developmental stage of a product or event discussed in the article. Essential for queries about clinical trial phases.",
    },
    {
        "field_name": "source",
        "type": "string (categorical)",
        "example_values": ["Press Release", "PR Newswire", "Business Wire"],
        "definition": "The original source of the news article, such as a newswire or official report.",
    },
    {
        "field_name": "company_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
        "definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching.",
    },
    {
        "field_name": "company_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
        "definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting.",
    },
    {
        "field_name": "territory_hq_s",
        "type": "string (multi-valued, hierarchical)",
        "example_values": ["united states of america", "europe", "europe western"],
        "definition": "The geographic location (country and continent) of a company's headquarters. It is hierarchical. Use for filtering by location.",
    },
    {
        "field_name": "therapeutic_category",
        "type": "string (specific)",
        "example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
        "definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries.",
    },
    {
        "field_name": "therapeutic_category_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cancer", "oncology", "infections", "cns"],
        "definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter.",
    },
    {
        "field_name": "compound_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["opdivo injection solution", "keytruda injection solution"],
        "definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds.",
    },
    {
        "field_name": "compound_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
        "definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name.",
    },
    {
        "field_name": "molecule_name",
        "type": "string (exact match, for faceting)",
        "example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
        "definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules.",
    },
    {
        "field_name": "molecule_name_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["cbd", "s1-220", "a1002n5s"],
        "definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name.",
    },
    {
        "field_name": "highest_phase",
        "type": "string (categorical)",
        "example_values": ["marketed", "phase 2", "phase 1"],
        "definition": "The highest stage of development a drug has ever reached.",
    },
    {
        "field_name": "drug_delivery_branch_s",
        "type": "string (multi-valued, for searching)",
        "example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
        "definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms.",
    },
    {
        "field_name": "drug_delivery_branch",
        "type": "string (categorical, specific, for faceting)",
        "example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
        "definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies.",
    },
    {
        "field_name": "route_branch",
        "type": "string (categorical)",
        "example_values": ["injection", "oral", "topical", "inhalation"],
        "definition": "The primary route of drug administration. Good for faceting on exact routes.",
    },
    {
        "field_name": "molecule_api_group",
        "type": "string (categorical)",
        "example_values": ["small molecules", "biologics", "nucleic acids"],
        "definition": "High-level classification of the drug's molecular type.",
    },
    {
        "field_name": "content",
        "type": "text (full-text search)",
        "example_values": ["The largest study to date...", "balstilimab..."],
        "definition": "The full text content of the news article. Use for keyword searches on topics not covered by other specific fields.",
    },
    {
        "field_name": "date",
        "type": "date",
        "example_values": ["2020-10-22T00:00:00Z"],
        "definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries.",
    },
    {
        "field_name": "date_year",
        "type": "number (year)",
        "example_values": [2020, 2021, 2022],
        "definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020').",
    },
    {
        "field_name": "total_deal_value_in_million",
        "type": "number (metric)",
        "example_values": [50, 120.5, 176.157, 1000],
        "definition": "The total value of a financial deal, in millions of USD. This is the primary numeric field for financial aggregations (sum, avg, etc.). To use this, you must also filter for news that has a deal value, e.g., 'total_deal_value_in_million:[0 TO *]'.",
    },
]


def format_metadata_for_prompt(fields=None):
    """Format field metadata as a Markdown-style bullet list for the LLM prompt.

    Args:
        fields: Optional iterable of field descriptors. Each descriptor is a
            mapping with the keys ``field_name``, ``type``, ``definition``
            and ``example_values`` (an iterable of values). When omitted,
            the module-level ``field_metadata`` list is formatted.

    Returns:
        A string with one section per field (name, type, definition and
        comma-separated examples), each section followed by a blank line.
        An empty input produces an empty string.

    Raises:
        KeyError: If a descriptor is missing one of the four required keys.
    """
    if fields is None:
        fields = field_metadata

    # Collect the sections and join once rather than growing a string with
    # repeated ``+=`` inside the loop.
    sections = []
    for field in fields:
        # NOTE(review): sub-bullets keep the single-space indent found in the
        # existing format strings -- confirm this is the intended layout.
        sections.append(
            f"- **{field['field_name']}**\n"
            f" - **Type**: {field['type']}\n"
            f" - **Definition**: {field['definition']}\n"
            # Example values may be numbers, so stringify before joining.
            f" - **Examples**: {', '.join(map(str, field['example_values']))}\n\n"
        )
    return "".join(sections)
|