# NOTE: non-code residue from the Hugging Face Spaces page scrape
# (status labels, file size, commit-hash row, line-number gutter)
# was removed here so the file parses as Python.
import streamlit as st
import pandas as pd
from transformers import BartForConditionalGeneration, TapexTokenizer, T5ForConditionalGeneration, T5Tokenizer
from prophet import Prophet
# Path to the CSS file — adjust to your project layout.
css_file = "style.css"
# Read the stylesheet explicitly as UTF-8: the original relied on the
# platform's locale default encoding, which corrupts accented characters
# (common in pt-BR stylesheets/fonts comments) on Windows.
with open(css_file, "r", encoding="utf-8") as css:
    css_style = css.read()
# Combined markdown: injected stylesheet + Google-font import + header HTML.
html_content = f"""
<style>
{css_style}
@import url('https://fonts.googleapis.com/css2?family=Kanit:wght@700&display=swap');
</style>
<div style='display: flex; flex-direction: column; align-items: flex-start;'>
<div style='display: flex; align-items: center;'>
<div style='width: 20px; height: 40px; background-color: green; margin-right: 1px;'></div>
<div style='width: 20px; height: 40px; background-color: red; margin-right: 1px;'></div>
<div style='width: 20px; height: 40px; background-color: yellow; margin-right: 20px;'></div>
<span style='font-size: 50px; font-weight: normal; font-family: "Kanit", sans-serif;'>NOSTRADAMUS</span>
</div>
<div style='text-align: left; width: 100%;'>
<span style='font-size: 20px; font-weight: normal; color: #333; font-family: "Kanit", sans-serif'>
Meta Prophet + Microsoft TAPEX</span>
</div>
</div>
"""
# Render the combined markup (unsafe_allow_html required for raw HTML/CSS).
st.markdown(html_content, unsafe_allow_html=True)
# One-time initialisation of per-session state: the accumulated anomaly
# table and the chat history survive Streamlit script reruns.
_SESSION_DEFAULTS = (
    ('all_anomalies', pd.DataFrame()),
    ('history', []),
)
for _key, _default in _SESSION_DEFAULTS:
    if _key not in st.session_state:
        st.session_state[_key] = _default
# Load the translation and TAPEX models. st.cache_resource keeps these
# heavyweight objects alive across Streamlit reruns — the original code
# re-instantiated (and potentially re-downloaded) all five on EVERY
# script execution, which is very slow.
@st.cache_resource
def _load_models():
    """Instantiate the pt<->en translators and the TAPEX model/tokenizers once.

    Returns a 5-tuple in the same order as the module-level names below.
    """
    return (
        T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-pt-en-t5"),
        T5ForConditionalGeneration.from_pretrained("unicamp-dl/translation-en-pt-t5"),
        BartForConditionalGeneration.from_pretrained("microsoft/tapex-large-finetuned-wtq"),
        TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq"),
        T5Tokenizer.from_pretrained("unicamp-dl/translation-pt-en-t5"),
    )

# Module-level names unchanged so the rest of the file keeps working.
pt_en_translator, en_pt_translator, tapex_model, tapex_tokenizer, tokenizer = _load_models()
def translate(text, model, tokenizer, source_lang="pt", target_lang="en"):
    """Translate *text* using a seq2seq model/tokenizer pair.

    NOTE(review): source_lang/target_lang are accepted for caller
    compatibility but are not used — the translation direction is fixed
    by whichever model is passed in.
    """
    token_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
    generated = model.generate(token_ids)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
def response(user_question, table_data):
    """Answer a Portuguese question about *table_data* via TAPEX.

    Pipeline: pt question -> en (T5) -> TAPEX table QA -> en answer -> pt (T5).
    Uses the module-level models/tokenizers loaded above.
    """
    # Translate the question into English for the English-only TAPEX model.
    question_en = translate(user_question, pt_en_translator, tokenizer, source_lang="pt", target_lang="en")
    # Encode table + question together and generate the answer.
    encoding = tapex_tokenizer(table=table_data, query=[question_en], padding=True, return_tensors="pt", truncation=True)
    generated = tapex_model.generate(**encoding)
    answer_en = tapex_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    # Translate the answer back to Portuguese for the user.
    return translate(answer_en, en_pt_translator, tokenizer, source_lang="en", target_lang="pt")
def load_data(uploaded_file):
    """Load an uploaded CSV or XLSX file into a DataFrame.

    Parameters:
        uploaded_file: file-like object with a ``.name`` attribute
            (as provided by st.file_uploader).

    Returns:
        pd.DataFrame with the file's contents.

    Raises:
        ValueError: for unsupported extensions. (The original fell
        through both branches and crashed with UnboundLocalError on
        ``return df``.)
    """
    name = uploaded_file.name
    if name.endswith('.csv'):
        return pd.read_csv(uploaded_file, quotechar='"', encoding='utf-8')
    if name.endswith('.xlsx'):
        return pd.read_excel(uploaded_file)
    raise ValueError(f"Unsupported file type: {name}")
def preprocess_data(df):
    """Reshape the raw report DataFrame into a Prophet-ready table.

    BUG FIX: the original ignored the *df* argument and re-read the
    global ``uploaded_file`` (duplicating load_data and risking a
    NameError); that dead re-read is removed — the passed DataFrame is
    used directly.

    Expected layout (produced by load_data): header labels in row 1
    starting at column 9, data from row 2 on, trailing column dropped.
    Returns a copy whose first column is 'Rotulo' (group labels) and
    whose remaining column names are month Timestamps.
    """
    # Slice the data region: rows from index 2, columns 9 .. second-to-last.
    new_df = df.iloc[2:, 9:-1].fillna(0)
    # Row 1 of the same column span holds the header labels.
    new_df.columns = df.iloc[1, 9:-1]
    # Strip parenthesised counters such as " (12)" appended to headers.
    new_df.columns = new_df.columns.str.replace(r" \(\d+\)", "", regex=True)
    # Portuguese month abbreviation -> month number.
    month_dict = {
        'Jan': '01', 'Fev': '02', 'Mar': '03', 'Abr': '04',
        'Mai': '05', 'Jun': '06', 'Jul': '07', 'Ago': '08',
        'Set': '09', 'Out': '10', 'Nov': '11', 'Dez': '12'
    }

    def convert_column_name(column_name):
        # The label column keeps its name; 'Mes/AAAA' headers become 'MM/AAAA'.
        if column_name == 'Rótulos de Linha':
            return column_name
        parts = column_name.split('/')
        month = parts[0].strip()
        year = ''.join(filter(str.isdigit, parts[1].strip()))
        return f"{month_dict.get(month, '00')}/{year}"

    new_df.columns = [convert_column_name(col) for col in new_df.columns]
    # Month headers parse to Timestamps; the label header coerces to NaT...
    new_df.columns = pd.to_datetime(new_df.columns, errors='coerce')
    # ...and that first (NaT-named) column is renamed to 'Rotulo'.
    new_df.rename(columns={new_df.columns[0]: 'Rotulo'}, inplace=True)
    return new_df.copy()
def apply_prophet(df_clean):
    """Fit one Prophet model per row of *df_clean* and collect anomalies.

    For each group row, forecasts the Timestamp-named columns with a 95%
    interval; observations outside [yhat_lower, yhat_upper] are anomalies.
    Returns a string-typed DataFrame with columns
    ['datetime', 'monetary value', 'group'], filtered to values >= 1e7
    and sorted by value descending.
    """
    all_anomalies = pd.DataFrame()
    for _, row in df_clean.iterrows():
        # Build the (ds, y) series from the Timestamp-named columns only.
        data = pd.DataFrame({
            'ds': [col for col in df_clean.columns if isinstance(col, pd.Timestamp)],
            'y': row[[isinstance(col, pd.Timestamp) for col in df_clean.columns]].values
        })
        data = data[data['y'] > 0].reset_index(drop=True)
        if data.empty or len(data) < 2:
            print(f"Pulando grupo {row['Rotulo']} porque há menos de 2 observações não nulas.")
            continue
        try:
            model = Prophet(interval_width=0.95)
            model.fit(data)
        except ValueError as e:
            print(f"Pulando grupo {row['Rotulo']} devido a erro: {e}")
            continue
        future = model.make_future_dataframe(periods=12, freq='M')
        forecast = model.predict(future)
        # Align the actuals with the forecast rows (forecast is longer by
        # the 12 future periods); pad with None.
        forecast['real'] = list(data['y']) + [None] * (len(forecast) - len(data))
        # Points outside the 95% interval are anomalies. .copy() prevents
        # pandas' SettingWithCopyWarning on the 'Group' assignment below.
        anomalies = forecast[(forecast['real'] < forecast['yhat_lower']) |
                             (forecast['real'] > forecast['yhat_upper'])].copy()
        anomalies['Group'] = row['Rotulo']
        all_anomalies = pd.concat([all_anomalies, anomalies[['ds', 'real', 'Group']]], ignore_index=True)
    if all_anomalies.empty:
        # ROBUSTNESS: no anomalies collected — the original crashed with a
        # KeyError on the filter below; return an empty frame with the
        # expected schema instead.
        return pd.DataFrame(columns=['datetime', 'monetary value', 'group'])
    all_anomalies.rename(columns={"ds": "datetime", "real": "monetary value", "Group": "group"}, inplace=True)
    all_anomalies = all_anomalies[all_anomalies['monetary value'].astype(float) >= 10000000.00]
    # BUG FIX: sort numerically BEFORE formatting to strings. The original
    # formatted first and then sorted, producing a lexicographic order
    # (e.g. "9000000.00" > "10000000.00").
    all_anomalies.sort_values(by=['monetary value'], ascending=False, inplace=True)
    all_anomalies['monetary value'] = all_anomalies['monetary value'].apply(lambda x: f"{x:.2f}")
    return all_anomalies.fillna('').astype(str)
# File-upload interface.
uploaded_file = st.file_uploader("Carregue um arquivo CSV ou XLSX", type=['csv', 'xlsx'])
if uploaded_file:
    # BUG FIX: the original guard was `'all_anomalies' not in
    # st.session_state`, which is ALWAYS False because the key is
    # initialised at the top of the script — so the model never ran.
    # Run the pipeline when no results have been cached yet.
    if st.session_state['all_anomalies'].empty:
        df = load_data(uploaded_file)
        df = preprocess_data(df)
        with st.spinner('Aplicando modelo de série temporal...'):
            # (duplicate assignment in the original removed)
            st.session_state['all_anomalies'] = apply_prophet(df)
# Question interface: answer against the cached anomaly table.
user_question = st.text_input("Escreva sua questão aqui:", "")
if user_question:
    bot_response = response(user_question, st.session_state['all_anomalies'])
    st.session_state['history'].append(('👤', user_question))
    st.session_state['history'].append(('🤖', bot_response))
# Render the conversation history.
for sender, message in st.session_state['history']:
    if sender == '👤':
        st.markdown(f"**👤 {message}**")
    elif sender == '🤖':
        st.markdown(f"**🤖 {message}**", unsafe_allow_html=True)
# Clear-history button.
if st.button("Limpar histórico"):
    st.session_state['history'] = []