Spaces:

joaomorossini
/

AI_Patent_Classification

Running

App Files Files Community

joaomorossini committed on Mar 7, 2024

Commit

23fee25

1 Parent(s): 0a08480

refactoring: create separate file for the prompt template

Browse files

Files changed (2) hide show

app.py +5 -39
prompt_template.py +36 -0

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from pandas import DataFrame as PandasDataFrame
 from llm import MessageChatCompletion
 from customization import css, js
 from examples import example_1, example_2, example_3, example_4
 load_dotenv()
@@ -33,46 +34,10 @@ def build_context(row):
 def click_button(model, api_key, abstract):
     labels = df['Subsector'].tolist()
-    contexts = [build_context(row) for _, row in df.iterrows()]
     language_model = MessageChatCompletion(model=model, api_key=api_key)
-    system_message = (f"""
-        You are a system designed to classify patent abstracts into one or more subsectors based on their content.
-        Each subsector is defined by a unique set of characteristics:
-        Name: The name of the subsector.
-        Definition: A brief description of the subsector.
-        Keywords: Important words associated with the subsector.
-        Does include: Elements typically found within the subsector.
-        Does not include: Elements typically not found within the subsector.
-        Consider 'nan' values as 'not available' or 'not applicable'.
-        When classifying an abstract, provide the following:
-        ## 1. Subsector(s): Name(s) of the subsector(s) you believe the abstract belongs to.
-        ## 2. Reasoning:
-        ### Conclusion: Explain why the abstract was classified in this subsector(s), based on its alignment with the subsector's definition, keywords, and includes/excludes criteria.
-        ### Keywords found: Specify any 'Keywords' from the subsector that are present in the abstract.
-        ### Does include found: Specify any 'Includes' criteria from the subsector that are present in the abstract.
-        ### If no specific 'Keywords' or 'Includes' are found, state that none were directly identified, but the classification was made based on the overall relevance to the subsector.
-        ## 3. Non-selected Subsectors:
-        - If a subsector had a high probability of being a match but was ultimately not chosen because the abstract contained terms from the 'Does not include' list, provide a brief explanation. Highlight the specific 'Does not include' terms found and why this led to the subsector's exclusion.
-        ## 4. Other Subsectors: You MUST ALWAYS SUGGEST NEW SUBSECTOR LABELS, different from the ones provided by the user. They can be new subsectors or subsets the given subsectors. REMEMBER: This is mandatory
-        ## 5. Match Score: Inside a markdown code block, provide a PYTHON DICTIONARY containing the match scores for all existing subsector labels and for any new labels suggested in item 4. Each probability should be formatted to show two decimal places.
-        <context>
-        {contexts}
-        </context>
-    """)
-    user_message = f"""
-        Classify this patent abstract into one or more labels, then format your response as markdown:
-        <labels>
-        {labels}
-        </labels>
-        <abstract>
-        {abstract}
-        </abstract>
-    """
     language_model.new_system_message(content=system_message)
     language_model.new_user_message(content=user_message)
     language_model.send_message()
@@ -94,6 +59,7 @@ def click_button(model, api_key, abstract):
     return match_score_dict, response_reasoning, logs_df
 def on_select(evt: gr.SelectData):  # SelectData is a subclass of EventData
     selected = df.iloc[[evt.index[0]]].iloc[0]
     name, definition, keywords, does_include, does_not_include = selected['Subsector'], selected['Definition'], selected['Keywords'], selected['Does include'], selected['Does not include']

 from llm import MessageChatCompletion
 from customization import css, js
 from examples import example_1, example_2, example_3, example_4
+from prompt_template import system_message_template, user_message_template
 load_dotenv()
 def click_button(model, api_key, abstract):
     labels = df['Subsector'].tolist()
+    prompt_context = [build_context(row) for _, row in df.iterrows()]
     language_model = MessageChatCompletion(model=model, api_key=api_key)
+    system_message = system_message_template.format(prompt_context=prompt_context)
+    user_message = user_message_template.format(labels=labels, abstract=abstract)
     language_model.new_system_message(content=system_message)
     language_model.new_user_message(content=user_message)
     language_model.send_message()
     return match_score_dict, response_reasoning, logs_df
 def on_select(evt: gr.SelectData):  # SelectData is a subclass of EventData
     selected = df.iloc[[evt.index[0]]].iloc[0]
     name, definition, keywords, does_include, does_not_include = selected['Subsector'], selected['Definition'], selected['Keywords'], selected['Does include'], selected['Does not include']

prompt_template.py ADDED Viewed

	@@ -0,0 +1,36 @@

+system_message_template = """
+    You are a system designed to classify patent abstracts into one or more subsectors based on their content.
+    Each subsector is defined by a unique set of characteristics:
+    Name: The name of the subsector.
+    Definition: A brief description of the subsector.
+    Keywords: Important words associated with the subsector.
+    Does include: Elements typically found within the subsector.
+    Does not include: Elements typically not found within the subsector.
+    Consider 'nan' values as 'not available' or 'not applicable'.
+    When classifying an abstract, provide the following:
+    ## 1. Subsector(s): Name(s) of the subsector(s) you believe the abstract belongs to.
+    ## 2. Reasoning:
+    ### Conclusion: Explain why the abstract was classified in this subsector(s), based on its alignment with the subsector's definition, keywords, and includes/excludes criteria.
+    ### Keywords found: Specify any 'Keywords' from the subsector that are present in the abstract.
+    ### Does include found: Specify any 'Includes' criteria from the subsector that are present in the abstract.
+    ### If no specific 'Keywords' or 'Includes' are found, state that none were directly identified, but the classification was made based on the overall relevance to the subsector.
+    ## 3. Non-selected Subsectors:
+    - If a subsector had a high probability of being a match but was ultimately not chosen because the abstract contained terms from the 'Does not include' list, provide a brief explanation. Highlight the specific 'Does not include' terms found and why this led to the subsector's exclusion.
+    ## 4. Other Subsectors: You MUST ALWAYS SUGGEST NEW SUBSECTOR LABELS, different from the ones provided by the user. They can be new subsectors or subsets the given subsectors. REMEMBER: This is mandatory
+    ## 5. Match Score: Inside a markdown code block, provide a PYTHON DICTIONARY containing the match scores for all existing subsector labels and for any new labels suggested in item 4. Each probability should be formatted to show two decimal places.
+    <context>
+    {prompt_context}
+    </context>
+"""
+user_message_template = """
+    Classify this patent abstract into one or more labels, then format your response as markdown:
+    <labels>
+    {labels}
+    </labels>
+    <abstract>
+    {abstract}
+    </abstract>
+"""