import streamlit as st
import os
import pandas as pd
# from langchain.chat_models import AzureChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain.output_parsers.enum import EnumOutputParser
from langchain_core.prompts import PromptTemplate
from enum import Enum

# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# LANGCHAIN_API_KEY = st.secrets['LANGCHAIN_API_KEY']
# os.environ["LANGCHAIN_PROJECT"] = "UC2e2e"

# LLM (LangChain) configuration for Azure OpenAI
OPENAI_API_KEY = st.secrets['OPENAI_API_KEY']
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = "https://davidfearn-gpt4.openai.azure.com"
OPENAI_API_VERSION = "2024-08-01-preview"
OPENAI_MODEL = "gpt-4o-mini"
# Function to read file contents
def read_file(file):
    """
    Reads the content of a text file from the 'assets' directory and returns it as a string.

    :param file: The file name (without extension) to read from the 'assets' directory.
    :return: The content of the file as a string, or None if an error occurs.
    """
    fp = f"assets/{file}.md"
    try:
        with open(fp, 'r', encoding='utf-8') as fh:  # 'fh' avoids shadowing the 'file' parameter
            return fh.read()
    except FileNotFoundError:
        print(f"The file at {fp} was not found.")
    except IOError:
        print(f"An error occurred while reading the file at {fp}.")
    return None
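# Usage sketch (hypothetical asset name; assumes assets/GSKGlossary.md exists):
#   glossary = read_file("GSKGlossary")
#   if glossary is None:
#       st.error("Missing prompt asset")  # handle the failure before building prompts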
# Function to generate structured insights from a single text chunk
def process_insight(chunk, topic, source):
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_insight_system_message")
        UserMessage = read_file("intl_insight_user_message")
    else:
        SystemMessage = read_file("ext_insight_system_message")
        UserMessage = read_file("ext_insight_user_message")

    class Insights(BaseModel):
        completed: bool = Field(description="Indicates that you think the requested number of insights has been completed")
        insight: str = Field(description="The MECE insight, returned in string format")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Insights)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    new_insights = []
    insights_data = []
    while True:
        # Invoke the LLM with the current chunk and the insights gathered so far
        counter = 5 - len(new_insights)
        new_insight_response = chain.invoke({
            "chunk": chunk,
            "existing_insights": new_insights,
            "counter": counter,
            "GSKGlossary": GSKGlossary,
            "topic": topic
        })
        classification = selectClass(new_insight_response.insight)
        # Append the new insight to the list
        new_insights.append(new_insight_response.insight)
        insights_data.append({
            # "completed": new_insight_response.completed,
            "classification": classification,
            "insight": new_insight_response.insight,
            "chunk": chunk
        })
        # Stop once "completed" is True and at least 3 insights have been collected
        if new_insight_response.completed and len(new_insights) >= 3:
            return pd.DataFrame(insights_data)
        # Hard cap: stop once the list of "new_insights" reaches 5
        if len(new_insights) == 5:
            return pd.DataFrame(insights_data)
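# Usage sketch (hypothetical chunk and topic; issues live Azure OpenAI calls):
#   insights_df = process_insight(
#       chunk="Regulators now require adverse events to be reported within 24 hours...",
#       topic="pharmacovigilance reporting",
#       source="intl",
#   )
#   # insights_df columns: classification, insight, chunk (3 to 5 rows)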
# Function to classify a single insight into one of the predefined categories
def selectClass(insight):
    classification_system_message = read_file("classification_system_message")
    classification_user_message = read_file("classification_user_message")

    class InsightClassification(Enum):
        IMPACT = "impact"
        CONSULTATION = "consultation"
        AWARENESS = "awareness"

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    parser = EnumOutputParser(enum=InsightClassification)
    system_message_template = SystemMessagePromptTemplate.from_template(classification_system_message)
    prompt = ChatPromptTemplate.from_messages([system_message_template, classification_user_message]).partial(options=parser.get_format_instructions())
    chain = prompt | llm | parser
    result = chain.invoke({"insight": insight})
    return result.value
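# Usage sketch (hypothetical insight text; the EnumOutputParser constrains the
# answer, so the return value is "impact", "consultation" or "awareness"):
#   label = selectClass("Sites must notify the QA team within 24 hours of a deviation.")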
def process_chunks(df_chunks, topic, source):
    """
    Processes chunks from the "ChunkText" column of a dataframe, invokes process_insight
    for each chunk, and combines the resulting dataframes into one dataframe.

    :param df_chunks: The dataframe containing text chunks in a "ChunkText" column.
    :param topic: The topic the insights should relate to.
    :param source: Which prompt set to use ("intl" for internal, otherwise external).
    :return: A combined dataframe of insights from all chunks.
    """
    all_insights = []
    for chunk in df_chunks["ChunkText"]:
        insights_df = process_insight(chunk, topic, source)
        all_insights.append(insights_df)
    return pd.concat(all_insights, ignore_index=True)
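# Usage sketch (hypothetical dataframe):
#   reg_chunks = pd.DataFrame({"ChunkText": ["first chunk ...", "second chunk ..."]})
#   combined = process_chunks(reg_chunks, topic="data integrity", source="intl")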
# Function to evaluate whether a single chunk relates to the given topic
def evaluation_llm(chunk, topic, source):
    GSKGlossary = read_file("GSKGlossary")
    if source == "intl":
        SystemMessage = read_file("intl_eval_system_message")
        UserMessage = read_file("intl_eval_user_message")
    else:
        SystemMessage = read_file("ext_eval_system_message")
        UserMessage = read_file("ext_eval_user_message")

    class Evaluate(BaseModel):
        decision: bool = Field(description="True: The content of the document relates to the topic. False: The content of the document does not relate to the topic.")
        justification: str = Field(description="Justify your decision in a logical and structured way.")

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Evaluate)
    # Create a chat prompt template combining system and human messages
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm
    return chain.invoke({
        "chunk": chunk,
        "topic": topic,
        "GSKGlossary": GSKGlossary
    })
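# Usage sketch (hypothetical values):
#   result = evaluation_llm("The SOP describes batch record review...",
#                           topic="deviation handling", source="ext")
#   result.decision        # bool
#   result.justification   # str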
def evaluation_process(df_chunks, topic, source):
    """
    Iterates over chunks in the DataFrame and evaluates each one using evaluation_llm.

    :param df_chunks: DataFrame containing chunks in a "ChunkText" column.
    :param topic: The topic to evaluate each chunk against.
    :param source: Which prompt set to use ("intl" for internal, otherwise external).
    :return: Updated DataFrame with Decision and Justification columns, the consensus value, and the decision counts.
    """
    decisions = []
    justifications = []
    # Avoid re-inserting columns if they already exist
    if "Decision" in df_chunks.columns:
        df_chunks = df_chunks.drop(columns=["Decision", "Justification"])
    for _, chunk in df_chunks.iterrows():
        result = evaluation_llm(chunk['ChunkText'], topic, source)
        decisions.append("True" if result.decision else "False")  # Convert bool to string
        justifications.append(result.justification)
    # Add new columns to the DataFrame
    df_chunks.insert(0, "Decision", decisions)
    df_chunks.insert(1, "Justification", justifications)
    # Count all True/False values and take the most frequent as the consensus
    consensus_count = df_chunks["Decision"].value_counts()
    consensus_value = consensus_count.idxmax()  # Most frequently occurring value
    return df_chunks, consensus_value, consensus_count
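# Usage sketch (hypothetical dataframe; consensus_value is the string "True" or
# "False", whichever occurs most often in the Decision column):
#   df_out, consensus_value, consensus_count = evaluation_process(reg_chunks, "data integrity", "intl")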
# Function to compare each insight against each SOP chunk and flag where review is needed
def process_compare(insight_df, sopChunk_df, topic):
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("compare_system_message")
    UserMessage = read_file("compare_user_message")

    # Define the structured output model
    class Compare(BaseModel):
        review: bool = Field(description="Indicates whether a review is needed")
        justification: str = Field(description="Justifies why a review is needed")

    # Initialize the LLM
    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    # Create the structured output and prompt chain
    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Compare)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    compare_data = []
    # Iterate over every (SOP chunk, insight) pair
    for sopChunk_index, sopChunk_row in sopChunk_df.iterrows():
        sop_chunk_text = sopChunk_row["ChunkText"]  # Extract the ChunkText column
        for insight_index, insight_row in insight_df.iterrows():
            insight_text = insight_row["insight"]  # Extract the insight column
            # Invoke the LLM with the extracted data
            compare_response = chain.invoke({
                "sopChunk": sop_chunk_text,
                "insight": insight_text,
                "topic": topic,
                "GSKGlossary": GSKGlossary
            })
            # Append the response to compare_data
            compare_data.append({
                "ReviewNeeded": compare_response.review,
                "Justification": compare_response.justification,
                "SOP": sop_chunk_text,
                "Insight": insight_text
            })
    # Return the comparison results as a single DataFrame
    return pd.DataFrame(compare_data)
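# Usage sketch (hypothetical frames; note this issues one LLM call per
# (SOP chunk, insight) pair, i.e. len(sopChunk_df) * len(insight_df) calls):
#   compare_df = process_compare(insights_df, sop_chunks, topic="record retention")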
# Function to score the risk of each comparison result
def risk_score_process(compare_df, topic):
    GSKGlossary = read_file("GSKGlossary")
    SystemMessage = read_file("risk_scoring_system_message")
    UserMessage = read_file("risk_scoring_user_message")

    # Define the Enum for predefined options
    class RiskClassification(str, Enum):
        HIGH = "high"
        MEDIUM = "medium"
        LOW = "low"

    # Define the Pydantic model for the structured output
    class Risk(BaseModel):
        risk_level: RiskClassification = Field(
            description="The selected risk classification option."
        )
        justification: str = Field(
            description="Justify the reason for choosing this risk classification."
        )
        advice: str = Field(
            description="Suggestions for changes that could be made to the standard operating procedure to mitigate the risk."
        )

    llm = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        openai_api_key=OPENAI_API_KEY,
        azure_endpoint=OPENAI_API_BASE,
        openai_api_type=OPENAI_API_TYPE,
        deployment_name=OPENAI_MODEL,
        temperature=0,
    )

    system_message_template = SystemMessagePromptTemplate.from_template(SystemMessage)
    structured_llm = llm.with_structured_output(Risk)
    prompt = ChatPromptTemplate.from_messages([system_message_template, UserMessage])
    chain = prompt | structured_llm

    risk_data = []
    # Iterate over the comparison results and score each row
    for index, row in compare_df.iterrows():
        # Invoke the LLM with the extracted data
        risk_response = chain.invoke({
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP'],
            "topic": topic
        })
        # Append the response to risk_data
        risk_data.append({
            "RiskLevel": risk_response.risk_level,
            "Justification": risk_response.justification,
            "advice": risk_response.advice,
            "comparison": row['Justification'],
            "insight": row['Insight'],
            "SOPchunk": row['SOP']
        })
    # Return the risk scores as a single DataFrame
    return pd.DataFrame(risk_data)
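# End-to-end usage sketch (hypothetical inputs; every step issues live Azure
# OpenAI calls, so run only with valid credentials in st.secrets):
#   reg_chunks = pd.DataFrame({"ChunkText": ["New regulation text ..."]})
#   sop_chunks = pd.DataFrame({"ChunkText": ["Current SOP text ..."]})
#   insights = process_chunks(reg_chunks, topic="record retention", source="ext")
#   compare_df = process_compare(insights, sop_chunks, "record retention")
#   risk_df = risk_score_process(compare_df, "record retention")
#   # risk_df columns: RiskLevel, Justification, advice, comparison, insight, SOPchunk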