# Spaces: Sleeping
import html
import io
import logging
import re
from typing import List, Optional, Tuple

import gradio as gr
import pandas as pd
# Logging setup: create this module's logger, then configure root logging at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class WebsiteCategorizerApp:
    """In-memory session for labelling website URLs loaded from a Google Sheet.

    The sheet is fetched as CSV. Values in the first column that start with
    ``http`` become records; a column named ``category`` (case-insensitive),
    when present, supplies the initial labels. ``current_index`` is a cursor
    into ``sheet_data``; ``results_data`` mirrors it with only the ``url`` and
    ``category`` keys and is the data that gets exported.
    """

    # Document part of a regular Google Sheets URL. Published-to-web links
    # (``/d/e/...``) are addressed differently, so they are deliberately not
    # matched here and fall through to the legacy rewrite.
    _SHEET_BASE_RE = re.compile(
        r"^(https?://docs\.google\.com/spreadsheets/d/(?!e/)[A-Za-z0-9_-]+)",
        re.IGNORECASE,
    )
    # Worksheet id; it may appear in the query string or in the fragment.
    _SHEET_GID_RE = re.compile(r"[?&#]gid=(\d+)")
    # Substring rewrites kept for URLs that are not recognised as Google Sheets
    # documents; order matters (most specific first).
    _LEGACY_EDIT_REWRITES = (
        ("/edit#gid=", "/export?format=csv&gid="),
        ("/edit?usp=sharing", "/export?format=csv"),
        ("/edit", "/export?format=csv"),
    )
    _NO_DATA = "Нет данных"

    def __init__(self):
        self.sheet_url = ""
        # Records as {"index", "url", "category"}; "index" is the DataFrame row label.
        self.sheet_data: List[dict] = []
        self.current_index = 0
        self.categories = ["NEWS/BLOG", "E-commerce", "OTHER", "COMPANIES", "Short"]
        # Same records as sheet_data, reduced to {"url", "category"} for export.
        self.results_data: List[dict] = []

    def convert_google_sheet_url(self, sheet_url: str) -> str:
        """Turn a Google Sheets share/edit link into its CSV export URL.

        Any ``gid`` found in the query string or the fragment is carried over,
        so links such as ``.../edit?gid=5#gid=5`` export the right worksheet.
        URLs that are not Google Sheets documents go through the legacy
        ``/edit`` substring rewrite and are otherwise returned unchanged
        (apart from surrounding whitespace being stripped).

        Returns an empty string when *sheet_url* is not a string.
        """
        try:
            cleaned = sheet_url.strip()
            document = self._SHEET_BASE_RE.match(cleaned)
        except (AttributeError, TypeError) as e:
            logger.error(f"Ошибка конвертации URL: {e}")
            return ""

        if document:
            export_url = f"{document.group(1)}/export?format=csv"
            gid = self._SHEET_GID_RE.search(cleaned)
            if gid:
                export_url += f"&gid={gid.group(1)}"
            return export_url

        for marker, replacement in self._LEGACY_EDIT_REWRITES:
            if marker in cleaned:
                return cleaned.replace(marker, replacement)
        return cleaned

    @staticmethod
    def _extract_records(df) -> List[dict]:
        """Build the record list from *df*, keeping only rows whose first column is a URL."""
        url_column = df.columns[0]
        category_column = next(
            (c for c in df.columns if str(c).strip().lower() == "category"),
            None,
        )
        records = []
        for index, row in df.iterrows():
            raw_value = row[url_column]
            if pd.isna(raw_value):
                continue
            url = str(raw_value).strip()
            # Non-URL values (e.g. titles) in the first column are skipped.
            if not url.lower().startswith("http"):
                continue
            category = ""
            if category_column is not None and pd.notna(row[category_column]):
                category = str(row[category_column]).strip()
            records.append({"index": index, "url": url, "category": category})
        return records

    def connect_to_sheet(self, sheet_url: str) -> Tuple[str, str]:
        """Load records from the sheet at *sheet_url*.

        Returns ``(status_message, first_display_url)``. On any failure the
        status starts with ``❌`` and the second element is ``""``; this method
        never raises, because its result is rendered directly in the UI.
        Once the CSV has been read, previously loaded records are replaced
        even if no valid URL is found.
        """
        if not sheet_url:
            return "❌ Ошибка: Введите URL Google таблицы", ""
        try:
            csv_url = self.convert_google_sheet_url(sheet_url)
            if not csv_url:
                return "❌ Ошибка: Неверный формат URL", ""
            df = pd.read_csv(csv_url)
            if df.empty or df.shape[1] < 1:
                return "❌ Ошибка: Таблица пуста или нет данных", ""
            records = self._extract_records(df)
        except Exception as e:  # UI boundary: network, HTTP and parse errors all end up here
            logger.exception("Ошибка подключения к таблице: %s", e)
            return f"❌ Ошибка: {str(e)}", ""

        self.sheet_data = records
        self.results_data = [
            {"url": record["url"], "category": record["category"]}
            for record in records
        ]
        if not records:
            return "❌ Ошибка: Не найдены валидные URL", ""

        self.current_index = 0
        self.sheet_url = sheet_url
        return (
            f"✅ Подключено успешно! Найдено {len(self.sheet_data)} записей",
            self.get_current_url_for_display(),
        )

    def get_current_url_for_display(self) -> str:
        """Return the current record's URL with a scheme, or ``""`` if there is no current record."""
        if not self.sheet_data or self.current_index >= len(self.sheet_data):
            return ""
        url = self.sheet_data[self.current_index]["url"]
        # Compare case-insensitively: records are accepted with any casing of
        # the scheme, so "HTTP://..." must not get a second "http://" prefix.
        if url and not url.lower().startswith(("http://", "https://")):
            url = "http://" + url
        return url

    def get_current_info(self) -> Tuple[str, str, str]:
        """Return ``(url, category, "position/total")`` for the current record.

        Without data the result is ``("", "", "Нет данных")``. An out-of-range
        cursor is reset to the first record.
        """
        if not self.sheet_data:
            return "", "", self._NO_DATA
        if self.current_index >= len(self.sheet_data):
            self.current_index = 0
        current = self.sheet_data[self.current_index]
        position = f"{self.current_index + 1}/{len(self.sheet_data)}"
        return current["url"], current["category"], position

    def navigate_to_index(self, index: int) -> Tuple[str, str, str, str]:
        """Move the cursor to *index* (clamped to the valid range).

        Returns ``(url, category, info, display_url)``. Without data the
        "no data" text is placed in the *info* slot and the URLs are empty, so
        the tuple has the same meaning in both cases.
        """
        if not self.sheet_data:
            return "", "", self._NO_DATA, ""
        self.current_index = max(0, min(index, len(self.sheet_data) - 1))
        url, category, info = self.get_current_info()
        return url, category, info, self.get_current_url_for_display()

    def _step(self, delta: int) -> Tuple[str, str, str, str]:
        """Move the cursor by *delta* records, wrapping around at both ends."""
        if not self.sheet_data:
            return "", "", self._NO_DATA, ""
        target = (self.current_index + delta) % len(self.sheet_data)
        return self.navigate_to_index(target)

    def previous_record(self) -> Tuple[str, str, str, str]:
        """Go to the previous record (wrapping to the last); see ``navigate_to_index`` for the result."""
        return self._step(-1)

    def next_record(self) -> Tuple[str, str, str, str]:
        """Go to the next record (wrapping to the first); see ``navigate_to_index`` for the result."""
        return self._step(1)

    def save_category(self, category: str) -> Tuple[str, str]:
        """Store *category* on the current record.

        Returns ``(status_message, csv_text)`` where *csv_text* is the full
        export after the change, or ``""`` on failure. ``None`` is stored as an
        empty label so that every stored category stays a string.
        """
        if not self.sheet_data:
            return "❌ Нет данных для сохранения", ""
        category = "" if category is None else category
        try:
            self.sheet_data[self.current_index]["category"] = category
            self.results_data[self.current_index]["category"] = category
            return f"✅ '{category}' сохранено", self.export_results()
        except Exception as e:  # UI boundary: report the problem instead of raising
            logger.exception("Ошибка сохранения категории: %s", e)
            return f"❌ Ошибка: {str(e)}", ""

    def export_results(self) -> str:
        """Return ``results_data`` as CSV text (``url,category`` header), or ``""`` when empty."""
        if not self.results_data:
            return ""
        csv_buffer = io.StringIO()
        # No encoding argument: the target is a text buffer, so it would be ignored.
        pd.DataFrame(self.results_data).to_csv(csv_buffer, index=False)
        return csv_buffer.getvalue()
# Single shared session object: every visitor of the app works on the same records.
app = WebsiteCategorizerApp()

# Shown in the viewer until a sheet is connected, and again whenever there is
# nothing to display (failed connect, navigation without data).
PLACEHOLDER_HTML = """
<div style='height:900px;display:flex;align-items:center;justify-content:center;background:#eee;border-radius:8px;'>
<p>Подключите Google таблицу</p>
</div>
"""

# Column order of the exported/visible table; used so the table keeps its
# header even when there are no rows yet.
RESULT_COLUMNS = ["url", "category"]


def _iframe_html(url: str) -> str:
    """Return the viewer markup embedding *url*.

    The URL comes from spreadsheet content, so it is attribute-escaped before
    being placed inside ``src="..."``.
    """
    safe_url = html.escape(url or "", quote=True)
    return f'<iframe src="{safe_url}" width="100%" height="900px" style="border-radius:8px;"></iframe>'


def _results_frame() -> pd.DataFrame:
    """Return the current results as a DataFrame with stable ``url``/``category`` columns."""
    return pd.DataFrame(app.results_data, columns=RESULT_COLUMNS)


with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h2 style='text-align:center;'>🌐 Категоризатор сайтов</h2>")

    with gr.Tabs():
        with gr.TabItem("Категоризация"):
            with gr.Row():
                with gr.Column(scale=1):
                    sheet_url_input = gr.Textbox(label="URL Google таблицы", lines=2)
                    connect_btn = gr.Button("🔗 Подключить", variant="primary")
                    connection_status = gr.HTML("")
                    with gr.Row():
                        prev_btn = gr.Button("⬅️", elem_id="prev-btn")
                        next_btn = gr.Button("➡️", elem_id="next-btn")
                    record_info = gr.HTML("")
                    current_url_display = gr.Textbox(label="Текущий URL", interactive=False)
                    category_dropdown = gr.Dropdown(choices=app.categories, label="Категория")
                    save_status = gr.HTML("")
                    export_btn = gr.Button("📥 Скачать CSV")
                    export_file = gr.File(visible=False)
                with gr.Column(scale=5):
                    website_viewer = gr.HTML(PLACEHOLDER_HTML)

        with gr.TabItem("Текущая таблица"):
            table_view = gr.DataFrame(
                value=_results_frame(),
                headers=RESULT_COLUMNS,
                datatype=["str", "str"],
                interactive=True,
            )
            refresh_table_btn = gr.Button("🔄 Обновить таблицу")

    # Holds the CSV text produced by the most recent save.
    csv_data = gr.State("")

    def handle_connect(url):
        """Connect to the sheet and return updates for status, viewer, URL box, dropdown and counter."""
        status, iframe_url = app.connect_to_sheet(url)
        if "✅" not in status:
            return (
                status,
                PLACEHOLDER_HTML,
                "",
                gr.update(choices=app.categories, value=None),
                "",
            )

        url_display, category, info = app.get_current_info()
        # Offer the built-in categories first, then any extra labels found in
        # the sheet; dict.fromkeys de-duplicates while keeping a stable order.
        sheet_labels = [row["category"] for row in app.results_data if row["category"]]
        all_categories = list(dict.fromkeys(app.categories + sheet_labels))
        return (
            status,
            _iframe_html(iframe_url),
            url_display,
            gr.update(choices=all_categories, value=category),
            info,
        )

    def handle_navigation(direction):
        """Step to the next ("next") or previous (anything else) record and return the UI updates."""
        if not app.sheet_data:
            # Nothing loaded: keep the placeholder rather than rendering an
            # iframe that points nowhere.
            return PLACEHOLDER_HTML, "", "", "Нет данных"
        if direction == "next":
            url_display, category, info, iframe_url = app.next_record()
        else:
            url_display, category, info, iframe_url = app.previous_record()
        return _iframe_html(iframe_url), url_display, category, info

    def handle_next():
        """Button callback: show the next record."""
        return handle_navigation("next")

    def handle_previous():
        """Button callback: show the previous record."""
        return handle_navigation("previous")

    def handle_category_change(category):
        """Save the label the user picked; returns ``(status_html, csv_text)``."""
        return app.save_category(category)

    def handle_export():
        """Write the results to ``results.csv`` and reveal it for download, or hide the file widget."""
        csv_content = app.export_results()
        if not csv_content:
            return gr.File(visible=False)
        # newline="" keeps the line endings pandas already produced.
        with open("results.csv", "w", encoding="utf-8", newline="") as f:
            f.write(csv_content)
        return gr.File(value="results.csv", visible=True)

    def refresh_table():
        """Return the current results for the table tab."""
        return _results_frame()

    navigation_outputs = [website_viewer, current_url_display, category_dropdown, record_info]

    connect_btn.click(
        handle_connect,
        inputs=[sheet_url_input],
        outputs=[connection_status, website_viewer, current_url_display, category_dropdown, record_info],
    )
    next_btn.click(handle_next, outputs=navigation_outputs)
    prev_btn.click(handle_previous, outputs=navigation_outputs)
    # `.input` fires only on user interaction. `.change` would also fire when
    # navigation or a (failed) connect sets the dropdown value programmatically,
    # re-saving on every step and overwriting the current label on a failed
    # reconnect.
    category_dropdown.input(
        handle_category_change,
        inputs=[category_dropdown],
        outputs=[save_status, csv_data],
    )
    export_btn.click(handle_export, outputs=[export_file])
    refresh_table_btn.click(refresh_table, outputs=[table_view])

    # Arrow-key navigation: forwards the left/right keys to the buttons above.
    # NOTE(review): Gradio may not execute <script> tags injected through
    # gr.HTML; if the keys do nothing, move this handler to a JS hook that the
    # installed Gradio version supports — confirm against its documentation.
    gr.HTML("""
    <script>
    document.addEventListener('keydown', function(event) {
        if (event.key === "ArrowRight") {
            document.getElementById('next-btn')?.click();
        }
        if (event.key === "ArrowLeft") {
            document.getElementById('prev-btn')?.click();
        }
    });
    </script>
    """)

if __name__ == "__main__":
    demo.launch()