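"""Castena: a Streamlit chat app for asking questions about (Spanish) podcast
transcriptions. Answers are generated by a Llama 2 model served through
Together over transcript chunks fetched by a retriever, and every timestamp
cited in an answer is replayed as an embedded YouTube fragment."""
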
# -- Import libraries
from langchain.prompts import PromptTemplate
from PIL import Image
from streamlit.logger import get_logger
from streamlit_player import st_player
import pandas as pd
import streamlit as st
import urllib.request
import argparse
import together
import logging
import requests
import utils
import spacy
import time
import os
import re

st.set_page_config(layout="wide")

def get_args():
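    """Parse the CLI arguments and prepare the environment and logger."""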
    # -- 1. Setup arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--DEFAULT_SYSTEM_PROMPT_LINK', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt", help='Value for DEFAULT_SYSTEM_PROMPT_LINK')
    parser.add_argument('--PODCAST_URL_VIDEO_PATH', type=str, default="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv", help='Value for PODCAST_URL_VIDEO_PATH')
    parser.add_argument('--TRANSCRIPTION', type=str, default='worldcast_roberto_vaquero', help='Name of the transcription')
    parser.add_argument('--MODEL', type=str, default='togethercomputer/llama-2-13b-chat', help='Model name')
    parser.add_argument('--EMB_MODEL', type=str, default='sentence-transformers/paraphrase-multilingual-mpnet-base-v2', help='Embedding model name')
    # -- Download the Spanish spaCy model used downstream by utils
    os.system("python -m spacy download es_core_news_lg")
    # -- 2. Setup env and logger (the Together API key must come from the
    # -- environment; never hardcode secrets in source code)
    if "TOGETHER_API_KEY" not in os.environ:
        raise RuntimeError("Please set the TOGETHER_API_KEY environment variable")
    logger = get_logger(__name__)
    # -- 3. Parse arguments
    args = parser.parse_args()
    return args, logger

def get_podcast_data(path):
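    """Load the podcast catalogue (names + YouTube URLs) from a ';'-separated CSV."""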
    podcast_url_video_df = pd.read_csv(path, sep=';')
    return podcast_url_video_df

def get_basics_comp(emb_model, model, default_system_prompt_link, _logger, podcast_url_video_df, img_size=100):
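    """Render the sidebar (icon, model and podcast pickers, footer links) and
    build the components for the selected podcast: the Together client, the
    translator, the spaCy pipeline, the retriever and the video metadata."""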
    # -- Fetch and resize the app icon (the sidebar below embeds the GIF directly via HTML)
    r = requests.get("https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif", stream=True)
    icon = Image.open(r.raw)
    icon = icon.resize((img_size, img_size))
    with st.sidebar.container():
        st.markdown(
            """
<head>
<style>
.footer1 {
    text-align: center;
}
</style>
</head>
<body>
<div class="footer1">
    <img src="https://raw.githubusercontent.com/AlbertoUAH/Castena/main/media/castena-animated-icon.gif" width="150" height="150">
</div>
<br>
</body>
            """,
            unsafe_allow_html=True,
        )
    # -- LLM selector (currently informational; only LLAMA is wired up)
    llm_option = st.sidebar.radio(
        "Seleccione el LLM",
        ["LLAMA", "GPT (not available yet)"]
    )
    st.sidebar.info('Modelo LLAMA: ' + str(model).split('/')[-1] + '\nModelo GPT: gpt-3.5-turbo', icon="ℹ️")
    podcast_list = list(podcast_url_video_df['podcast_name_lit'].apply(lambda x: x.replace("'", "")))
    # -- Reset the chat whenever the user picks a different podcast
    video_option = st.sidebar.selectbox(
        "Seleccione el podcast",
        podcast_list,
        on_change=clean_chat
    )
    # -- Add icons
    with st.sidebar.container():
        st.markdown(
            """
<head>
<style>
.footer2 {
    position: fixed;
    bottom: 2%;
    left: 6.5%;
}
.footer2 a {
    margin: 10px;
    text-decoration: none;
}
</style>
</head>
<body>
<div class="footer2">
    <a href="https://www.linkedin.com/in/alberto-fernandez-hernandez-3a3474136">
        <img src="https://cdn-icons-png.flaticon.com/128/3536/3536505.png" width="32" height="32">
    </a>
    <a href="https://github.com/AlbertoUAH/Castena">
        <img src="https://cdn-icons-png.flaticon.com/128/733/733553.png" width="32" height="32">
    </a>
    <a href="https://www.buymeacoffee.com/castena">
        <img src="https://cdn-icons-png.flaticon.com/128/761/761767.png" width="32" height="32">
    </a>
</div>
</body>
            """,
            unsafe_allow_html=True,
        )
    video_option_joined = '_'.join(video_option.replace(': Entrevista a ', ' ').lower().split(' ')).replace("'", "")
    video_option_joined_path = "{}_transcription.txt".format(video_option_joined)
    youtube_video_url = list(podcast_url_video_df[podcast_url_video_df['podcast_name'].str.contains(video_option_joined)]['youtube_video_url'])[0].replace("'", "")
    st.title("[Podcast: {}]({})".format(video_option.replace("'", "").title(), youtube_video_url))
    # -- 4. Setup request for system prompt
    f = urllib.request.urlopen(default_system_prompt_link)
    default_system_prompt = str(f.read(), 'UTF-8')
    # -- 5. Setup app
    translator, nlp, retriever = utils.setup_app(video_option_joined_path, emb_model, model, _logger)
    # -- 6. Setup model
    together.api_key = os.environ["TOGETHER_API_KEY"]
    # together.Models.start(model)
    return together, translator, nlp, retriever, video_option, video_option_joined_path, default_system_prompt, youtube_video_url

def clean_chat():
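    """Reset the conversation; used as the on_change callback of the podcast selector."""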
    st.session_state.conversation = None
    st.session_state.chat_history = None
    st.session_state.messages = [{'role': 'assistant', 'content': 'Nuevo chat creado'}]

def main():
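    """Build the QA chain for the selected podcast and run the chat UI loop."""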
    args, logger = get_args()
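    # -- Llama 2 chat template delimiters: [INST]...[/INST] wraps each user
    # -- turn and <<SYS>>...<</SYS>> wraps the system prompt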
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
    PODCAST_URL_VIDEO_PATH = args.PODCAST_URL_VIDEO_PATH
    DEFAULT_SYSTEM_PROMPT_LINK = args.DEFAULT_SYSTEM_PROMPT_LINK
    TRANSCRIPTION = args.TRANSCRIPTION
    TRANSCRIPTION_PATH = '{}_transcription.txt'.format(TRANSCRIPTION)
    MODEL = args.MODEL
    EMB_MODEL = args.EMB_MODEL
    WIDTH = 50
    SIDE = (100 - WIDTH) / 2
    podcast_url_video_df = get_podcast_data(PODCAST_URL_VIDEO_PATH)
    together, translator, nlp, retriever, video_option, video_option_joined_path, default_system_prompt, youtube_video_url = get_basics_comp(
        EMB_MODEL, MODEL, DEFAULT_SYSTEM_PROMPT_LINK, logger, podcast_url_video_df, img_size=100
    )
    # -- 7. Setup prompt template + llm chain
    instruction = """CONTEXTO:\n\n {context}\n
PREGUNTA: {question}
RESPUESTA: """
    prompt_template = utils.get_prompt(instruction, default_system_prompt, B_SYS, E_SYS, B_INST, E_INST, logger)
    llama_prompt = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )
    chain_type_kwargs = {"prompt": llama_prompt}
    qa_chain = utils.create_llm_chain(MODEL, retriever, chain_type_kwargs, logger, video_option_joined_path)
    # ---------------------------------------------------------------------
    _, container, _ = st.columns([SIDE, WIDTH, SIDE])
    with container:
        st_player(utils.typewrite(youtube_video_url))
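    # -- Streamlit reruns the script on every interaction, so the chat
    # -- history must be kept in st.session_state to survive reruns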
| if "messages" not in st.session_state: | |
| st.session_state.messages = [] | |
| for message in st.session_state.messages: | |
| with st.chat_message(message["role"]): | |
| st.markdown(message["content"]) | |
    if prompt := st.chat_input("¡Pregunta lo que quieras!"):
        with st.chat_message("user"):
            st.markdown(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("assistant"):
            llm_response, cleaned_prompt = qa_chain(prompt)
            llm_response = utils.process_llm_response(llm_response, nlp)
            st.markdown(llm_response)
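            # -- Scan the answer for HH:MM:SS timestamps (optionally with
            # -- microseconds) so each cited fragment can be replayed below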
            start_time_str_list = []
            start_time_seconds_list = []
            end_time_seconds_list = []
            for response in llm_response.split('\n'):
                if re.search(r'(\d{2}:\d{2}:\d{2}(\.\d{6})?)', response) is not None:
                    start_time_str, start_time_seconds, _, end_time_seconds = utils.add_hyperlink_and_convert_to_seconds(response, cleaned_prompt)
                    start_time_str_list.append(start_time_str)
                    start_time_seconds_list.append(start_time_seconds)
                    end_time_seconds_list.append(end_time_seconds)
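            # -- Embed one player per cited fragment, constrained to its
            # -- [start, end] window via the YouTube start/end URL parameters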
            if start_time_str_list:
                for start_time_seconds, start_time_str, end_time_seconds in zip(start_time_seconds_list, start_time_str_list, end_time_seconds_list):
                    st.markdown("__Fragmento: " + start_time_str + "__")
                    _, container, _ = st.columns([SIDE, WIDTH, SIDE])
                    with container:
                        st_player(youtube_video_url.replace("?enablejsapi=1", "") + f'?start={start_time_seconds}&end={end_time_seconds}')
        st.session_state.messages.append({"role": "assistant", "content": llm_response})

# -- Sample: streamlit run app.py -- --DEFAULT_SYSTEM_PROMPT_LINK=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/prompts/default_system_prompt.txt --PODCAST_URL_VIDEO_PATH=https://raw.githubusercontent.com/AlbertoUAH/Castena/main/data/podcast_youtube_video.csv --TRANSCRIPTION=worldcast_roberto_vaquero --MODEL=togethercomputer/llama-2-7b-chat --EMB_MODEL=BAAI/bge-base-en-v1.5
if __name__ == '__main__':
    main()