Commit 261f952 · 1 Parent(s): c58ea62 · progress more 44+
app.py CHANGED

@@ -14,6 +14,9 @@ from openpyxl import load_workbook
 from openpyxl import Workbook
 from openpyxl.utils.dataframe import dataframe_to_rows
 from sentiment_decorators import sentiment_analysis_decorator
+from langchain.llms import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 
 # Initialize pymystem3 for lemmatization
 mystem = Mystem()
@@ -26,12 +29,112 @@ finbert_tone = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")
 rubert1 = pipeline("sentiment-analysis", model = "DeepPavlov/rubert-base-cased")
 rubert2 = pipeline("sentiment-analysis", model = "blanchefort/rubert-base-cased-sentiment")
 
+def init_langchain_llm():
+    pipe = pipeline("text-generation", model="nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
+    llm = HuggingFacePipeline(pipeline=pipe)
+    return llm
+
+# Function to estimate impact using LLM
+def estimate_impact(llm, news_text):
+    template = """
+    Analyze the following news piece and estimate its monetary impact in Russian rubles for the next 6 months.
+    If a monetary estimate is not possible, categorize the impact as "Значительный", "Незначительный", or "Неопределенный".
+    Also provide a short reasoning (max 100 words) for your assessment.
+
+    News: {news}
+
+    Estimated Impact:
+    Reasoning:
+    """
+    prompt = PromptTemplate(template=template, input_variables=["news"])
+    chain = LLMChain(llm=llm, prompt=prompt)
+    response = chain.run(news=news_text)
+
+    # Parse the response to extract impact and reasoning
+    # Parsing logic is very important! Might be needed to be changed
+    impact, reasoning = response.split("Reasoning:")
+    impact = impact.strip()
+    reasoning = reasoning.strip()
+
+    return impact, reasoning
+
+def process_file_with_llm(uploaded_file, llm):
+    df = process_file(uploaded_file)
+
+    # Add new columns for LLM analysis
+    df['LLM_Impact'] = ''
+    df['LLM_Reasoning'] = ''
+
+    for index, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            impact, reasoning = estimate_impact(llm, row['Выдержки из текста'])
+            df.at[index, 'LLM_Impact'] = impact
+            df.at[index, 'LLM_Reasoning'] = reasoning
+
+    return df
+
+def create_output_file_with_llm(df, uploaded_file, analysis_df):
+    wb = load_workbook("sample_file.xlsx")
+
+    # Update 'Сводка' sheet
+    summary_df = pd.DataFrame({
+        'Объект': df['Объект'].unique(),
+        'Всего новостей': df.groupby('Объект').size(),
+        'Отрицательные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Negative').any(axis=1)].groupby('Объект').size(),
+        'Положительные': df[df[['FinBERT', 'RoBERTa', 'FinBERT-Tone']].eq('Positive').any(axis=1)].groupby('Объект').size(),
+        'Impact': df.groupby('Объект')['LLM_Impact'].agg(lambda x: x.value_counts().index[0] if x.any() else 'Неопределенный')
+    })
+    ws = wb['Сводка']
+    for r_idx, row in enumerate(dataframe_to_rows(summary_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Update 'Значимые' sheet
+    significant_data = []
+    for _, row in df.iterrows():
+        if any(row[model] in ['Negative', 'Positive'] for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
+            sentiment = 'Negative' if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']) else 'Positive'
+            significant_data.append([row['Объект'], 'релевантен', sentiment, row['LLM_Impact'], row['Заголовок'], row['Выдержки из текста']])
+
+    ws = wb['Значимые']
+    for r_idx, row in enumerate(significant_data, start=3):
+        for c_idx, value in enumerate(row, start=3):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Update 'Анализ' sheet
+    analysis_df['LLM_Reasoning'] = df['LLM_Reasoning']
+    ws = wb['Анализ']
+    for r_idx, row in enumerate(dataframe_to_rows(analysis_df, index=False, header=False), start=4):
+        for c_idx, value in enumerate(row, start=5):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Copy 'Публикации' sheet from original uploaded file
+    original_df = pd.read_excel(uploaded_file, sheet_name='Публикации')
+    ws = wb['Публикации']
+    for r_idx, row in enumerate(dataframe_to_rows(original_df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+    # Add 'Тех.приложение' sheet with processed data
+    if 'Тех.приложение' not in wb.sheetnames:
+        wb.create_sheet('Тех.приложение')
+    ws = wb['Тех.приложение']
+    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), start=1):
+        for c_idx, value in enumerate(row, start=1):
+            ws.cell(row=r_idx, column=c_idx, value=value)
+
+
+    output = io.BytesIO()
+    wb.save(output)
+    output.seek(0)
+    return output
+
 def create_analysis_data(df):
     analysis_data = []
     for _, row in df.iterrows():
         if any(row[model] == 'Negative' for model in ['FinBERT', 'RoBERTa', 'FinBERT-Tone']):
             analysis_data.append([row['Объект'], row['Заголовок'], 'РИСК УБЫТКА', '', row['Выдержки из текста']])
-    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', '
+    return pd.DataFrame(analysis_data, columns=['Объект', 'Заголовок', 'Признак', 'Пояснение', 'Текст сообщения'])
 
 # Function for lemmatizing Russian text
 def lemmatize_text(text):
@@ -125,6 +228,20 @@ def fuzzy_deduplicate(df, column, threshold=65):
         indices_to_keep.append(i)
     return df.iloc[indices_to_keep]
 
+def format_elapsed_time(seconds):
+    hours, remainder = divmod(int(seconds), 3600)
+    minutes, seconds = divmod(remainder, 60)
+
+    time_parts = []
+    if hours > 0:
+        time_parts.append(f"{hours} час{'ов' if hours != 1 else ''}")
+    if minutes > 0:
+        time_parts.append(f"{minutes} минут{'' if minutes == 1 else 'ы' if 2 <= minutes <= 4 else ''}")
+    if seconds > 0 or not time_parts:  # always show seconds if it's the only non-zero value
+        time_parts.append(f"{seconds} секунд{'а' if seconds == 1 else 'ы' if 2 <= seconds <= 4 else ''}")
+
+    return " ".join(time_parts)
+
 
 def process_file(uploaded_file):
     df = pd.read_excel(uploaded_file, sheet_name='Публикации')
@@ -257,7 +374,7 @@ def create_output_file(df, uploaded_file, analysis_df):
     return output
 
 def main():
-    st.title("... приступим к анализу... версия
+    st.title("... приступим к анализу... версия 44+")
 
     uploaded_file = st.file_uploader("Выбирайте Excel-файл", type="xlsx")
 
@@ -292,7 +409,8 @@ def main():
         # Calculate elapsed time
         end_time = time.time()
         elapsed_time = end_time - start_time
-
+        formatted_time = format_elapsed_time(elapsed_time)
+        st.success(f"Обработка завершена за {formatted_time}.")
 
         # Offer download of results
 
@@ -302,5 +420,20 @@
             file_name="результат_анализа_новостей.xlsx",
             mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
         )
+
+        # Add button for LLM analysis
+        if st.button("Что скажет нейросеть?"):
+            st.info("Анализ нейросетью начался. Это может занять некоторое время...")
+            llm = init_langchain_llm()
+            df_with_llm = process_file_with_llm(uploaded_file, llm)
+            output_with_llm = create_output_file_with_llm(df_with_llm, uploaded_file, analysis_df)
+            st.success("Анализ нейросетью завершен!")
+            st.download_button(
+                label="Скачать результат анализа с оценкой нейросети",
+                data=output_with_llm,
+                file_name="результат_анализа_с_нейросетью.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+
+
 if __name__ == "__main__":
     main()
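
Note on the added parsing step: the commit's own comment in estimate_impact ("Parsing logic is very important! Might be needed to be changed") flags that response.split("Reasoning:") raises a ValueError whenever the model omits or repeats the "Reasoning:" marker. A minimal defensive variant is sketched below; it is not part of the commit, and the helper name parse_impact_response and the "Неопределенный" fallback label are assumptions for illustration.

def parse_impact_response(response):
    # Defensive parsing sketch (not in the commit): split an LLM reply into
    # (impact, reasoning) without raising when the expected marker is missing.
    impact, reasoning = "Неопределенный", ""  # assumed fallback label
    if "Reasoning:" in response:
        impact_part, _, reasoning_part = response.partition("Reasoning:")
        impact = impact_part.replace("Estimated Impact:", "").strip() or impact
        reasoning = reasoning_part.strip()
    else:
        impact = response.strip() or impact
    return impact, reasoning

estimate_impact could call a helper like this in place of the bare split if the response format proves unstable in practice.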