"""Gradio app for manually categorizing websites listed in a Google Sheet.

The sheet is read through its public CSV export. Column A must contain the
URLs; an optional column named ``category`` (case-insensitive) provides the
existing labels. The user steps through the records, previews each site and
picks a category; the result can be downloaded as CSV.
"""

import html
import io
import logging
import re
from typing import List, Optional, Tuple

import gradio as gr
import pandas as pd

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Splits a Google Sheets link into the part before "/edit" and whatever
# follows it (query string and/or fragment).
_SHEET_EDIT_RE = re.compile(r"^(?P<base>.*?/spreadsheets/d/[^/?#]+)/edit(?P<rest>.*)$")
# Finds the worksheet id in either the query string or the fragment.
_SHEET_GID_RE = re.compile(r"[?&#]gid=([^&#]+)")

# Shown in the viewer pane while no record is available.
_PLACEHOLDER_HTML = "\n\nПодключите Google таблицу\n\n"

_RESULT_COLUMNS = ["url", "category"]


class WebsiteCategorizerApp:
    """In-memory state of one categorization session.

    Attributes are plain lists/values; ``sheet_data`` and ``results_data``
    are kept index-aligned (one entry per valid URL found in the sheet).
    """

    def __init__(self):
        self.sheet_url = ""
        self.sheet_data = []      # dicts with "index", "url", "category"
        self.current_index = 0
        self.categories = ["NEWS/BLOG", "E-commerce", "OTHER", "COMPANIES", "Short"]
        self.results_data = []    # dicts with "url", "category" (export view)

    def convert_google_sheet_url(self, sheet_url: str) -> str:
        """Turn a Google Sheets "edit" link into its CSV export link.

        The worksheet id (``gid``) is carried over whether it appears in the
        query string or in the fragment. URLs that are not recognised as a
        Sheets edit link fall back to plain ``/edit`` substitution, and URLs
        without ``/edit`` are returned unchanged.

        Returns an empty string when ``sheet_url`` is not a string.
        """
        if not isinstance(sheet_url, str):
            logger.error("Ошибка конвертации URL: %r", sheet_url)
            return ""

        match = _SHEET_EDIT_RE.match(sheet_url)
        if match:
            export_url = match.group("base") + "/export?format=csv"
            gid = _SHEET_GID_RE.search(match.group("rest"))
            if gid:
                export_url += "&gid=" + gid.group(1)
            return export_url

        # Fallback for links that do not look like .../spreadsheets/d/<id>/edit
        if "/edit#gid=" in sheet_url:
            return sheet_url.replace("/edit#gid=", "/export?format=csv&gid=")
        if "/edit?usp=sharing" in sheet_url:
            return sheet_url.replace("/edit?usp=sharing", "/export?format=csv")
        if "/edit" in sheet_url:
            return sheet_url.replace("/edit", "/export?format=csv")
        return sheet_url

    def connect_to_sheet(self, sheet_url: str) -> Tuple[str, str]:
        """Load the sheet and reset the session to its first record.

        Only rows whose first-column value starts with "http"
        (case-insensitive) are kept; anything else is skipped.

        Returns:
            ``(status_message, display_url)``. On failure the message starts
            with "❌" and the URL is empty.
        """
        try:
            if not sheet_url:
                return "❌ Ошибка: Введите URL Google таблицы", ""

            csv_url = self.convert_google_sheet_url(sheet_url)
            if not csv_url:
                return "❌ Ошибка: Неверный формат URL", ""

            df = pd.read_csv(csv_url)
            if df.empty or df.shape[1] < 1:
                return "❌ Ошибка: Таблица пуста или нет данных", ""

            # Column A always holds the URL.
            url_column = df.columns[0]
            # Category column is looked up by header name, case-insensitively.
            category_column = next(
                (c for c in df.columns if str(c).strip().lower() == "category"),
                None,
            )

            sheet_rows = []
            result_rows = []
            for index, row in df.iterrows():
                cell = row[url_column]
                raw_value = str(cell).strip() if pd.notna(cell) else ""
                if not raw_value.lower().startswith("http"):
                    # Empty cell or a title/non-URL value: not categorizable.
                    continue

                category = ""
                if category_column is not None and pd.notna(row[category_column]):
                    category = str(row[category_column]).strip()

                sheet_rows.append({"index": index, "url": raw_value, "category": category})
                result_rows.append({"url": raw_value, "category": category})

            # Replace the state only after the whole sheet parsed, so a
            # failure midway cannot leave the two lists half-filled.
            self.sheet_data = sheet_rows
            self.results_data = result_rows

            if not self.sheet_data:
                return "❌ Ошибка: Не найдены валидные URL", ""

            self.current_index = 0
            self.sheet_url = sheet_url
            return (
                f"✅ Подключено успешно! Найдено {len(self.sheet_data)} записей",
                self.get_current_url_for_display(),
            )
        except Exception as e:  # UI boundary: report instead of crashing
            logger.exception("Ошибка подключения к таблице: %s", e)
            return f"❌ Ошибка: {e}", ""

    def get_current_url_for_display(self) -> str:
        """Return the current record's URL with a scheme, or "" if none."""
        if not self.sheet_data or self.current_index >= len(self.sheet_data):
            return ""
        url = self.sheet_data[self.current_index]["url"]
        # Scheme check is case-insensitive to match the filter used on load.
        if url and not url.lower().startswith(("http://", "https://")):
            url = "http://" + url
        return url

    def get_current_info(self) -> Tuple[str, str, str]:
        """Return ``(url, category, "position/total")`` for the current record."""
        if not self.sheet_data:
            return "", "", "Нет данных"
        if self.current_index >= len(self.sheet_data):
            self.current_index = 0
        current = self.sheet_data[self.current_index]
        position = f"{self.current_index + 1}/{len(self.sheet_data)}"
        return current["url"], current["category"], position

    def navigate_to_index(self, index: int) -> Tuple[str, str, str, str]:
        """Jump to ``index`` (clamped to the valid range).

        Returns:
            ``(url, category, info, display_url)``. Without data, ``info`` is
            "Нет данных" and the other fields are empty.
        """
        if not self.sheet_data:
            return "", "", "Нет данных", ""
        self.current_index = max(0, min(index, len(self.sheet_data) - 1))
        url, category, info = self.get_current_info()
        return url, category, info, self.get_current_url_for_display()

    def previous_record(self) -> Tuple[str, str, str, str]:
        """Step to the previous record, wrapping from the first to the last."""
        if not self.sheet_data:
            return "", "", "Нет данных", ""
        self.current_index = (self.current_index - 1) % len(self.sheet_data)
        return self.navigate_to_index(self.current_index)

    def next_record(self) -> Tuple[str, str, str, str]:
        """Step to the next record, wrapping from the last to the first."""
        if not self.sheet_data:
            return "", "", "Нет данных", ""
        self.current_index = (self.current_index + 1) % len(self.sheet_data)
        return self.navigate_to_index(self.current_index)

    def save_category(self, category: str) -> Tuple[str, str]:
        """Store ``category`` on the current record.

        Returns:
            ``(status_message, csv_text)`` where ``csv_text`` is the full
            result table, or "" on failure.
        """
        if not self.sheet_data:
            return "❌ Нет данных для сохранения", ""
        try:
            self.sheet_data[self.current_index]["category"] = category
            self.results_data[self.current_index]["category"] = category
            return f"✅ '{category}' сохранено", self.export_results()
        except Exception as e:  # UI boundary: report instead of crashing
            logger.exception("Ошибка сохранения категории: %s", e)
            return f"❌ Ошибка: {e}", ""

    def export_results(self) -> str:
        """Return the result table as CSV text ("" when there is no data)."""
        if not self.results_data:
            return ""
        # With no target given, to_csv returns the CSV as a string.
        return pd.DataFrame(self.results_data).to_csv(index=False)


# Single module-level instance: the handlers below all read and write it,
# so its state is shared by every client of this process.
app = WebsiteCategorizerApp()


def _build_iframe_html(url: str) -> str:
    """Return the viewer markup for ``url`` (placeholder when it is empty).

    The URL comes from the spreadsheet, so it is escaped before being placed
    in the attribute. Sites that forbid framing will still not render.
    """
    if not url:
        return _PLACEHOLDER_HTML
    safe_url = html.escape(url, quote=True)
    return (
        f'<iframe src="{safe_url}" width="100%" height="800" '
        f'style="border: 0;"></iframe>'
    )


def handle_connect(url):
    """Connect to the sheet and populate every widget for the first record."""
    status, iframe_url = app.connect_to_sheet(url)
    if "✅" not in status:
        return (
            status,
            _PLACEHOLDER_HTML,
            "",
            gr.update(choices=app.categories, value=None),
            "",
        )

    url_display, category, info = app.get_current_info()
    # Merge categories found in the data into the predefined ones, keeping
    # a stable order (predefined first, then first appearance in the sheet).
    data_categories = [row["category"] for row in app.results_data if row["category"]]
    all_categories = list(dict.fromkeys(app.categories + data_categories))
    return (
        status,
        _build_iframe_html(iframe_url),
        url_display,
        # An empty category is shown as "nothing selected".
        gr.update(choices=all_categories, value=category or None),
        info,
    )


def handle_navigation(direction):
    """Move to the next ("next") or previous (anything else) record."""
    if direction == "next":
        url_display, category, info, iframe_url = app.next_record()
    else:
        url_display, category, info, iframe_url = app.previous_record()
    return (
        _build_iframe_html(iframe_url),
        url_display,
        gr.update(value=category or None),
        info,
    )


def handle_category_change(category):
    """Persist the category picked in the dropdown."""
    if not category:
        # The dropdown was reset by the UI (navigation to an unlabeled
        # record, failed reconnect); that must not overwrite stored data.
        return "", app.export_results()
    return app.save_category(category)


def handle_export():
    """Write the results to ``results.csv`` and expose it for download."""
    csv_content = app.export_results()
    if csv_content:
        # newline="" keeps the line endings produced by pandas untouched.
        with open("results.csv", "w", encoding="utf-8", newline="") as f:
            f.write(csv_content)
        return gr.File(value="results.csv", visible=True)
    return gr.File(visible=False)


def refresh_table():
    """Return the current results as a DataFrame for the table tab."""
    return pd.DataFrame(app.results_data, columns=_RESULT_COLUMNS)


# NOTE(review): the widget nesting below is reconstructed from statement
# order; confirm against the intended layout.
with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h1>🌐 Категоризатор сайтов</h1>")

    with gr.Tabs():
        with gr.TabItem("Категоризация"):
            with gr.Row():
                with gr.Column(scale=1):
                    sheet_url_input = gr.Textbox(label="URL Google таблицы", lines=2)
                    connect_btn = gr.Button("🔗 Подключить", variant="primary")
                    connection_status = gr.HTML("")
                    with gr.Row():
                        prev_btn = gr.Button("⬅️", elem_id="prev-btn")
                        next_btn = gr.Button("➡️", elem_id="next-btn")
                    record_info = gr.HTML("")
                    current_url_display = gr.Textbox(label="Текущий URL", interactive=False)
                    category_dropdown = gr.Dropdown(choices=app.categories, label="Категория")
                    save_status = gr.HTML("")
                    export_btn = gr.Button("📥 Скачать CSV")
                    export_file = gr.File(visible=False)
                with gr.Column(scale=5):
                    website_viewer = gr.HTML(_PLACEHOLDER_HTML)

        with gr.TabItem("Текущая таблица"):
            table_view = gr.DataFrame(
                value=pd.DataFrame(app.results_data, columns=_RESULT_COLUMNS),
                headers=["url", "category"],
                datatype=["str", "str"],
                interactive=True,
            )
            refresh_table_btn = gr.Button("🔄 Обновить таблицу")

    csv_data = gr.State("")

    _record_outputs = [website_viewer, current_url_display, category_dropdown, record_info]

    connect_btn.click(
        handle_connect,
        inputs=[sheet_url_input],
        outputs=[connection_status] + _record_outputs,
    )
    next_btn.click(lambda: handle_navigation("next"), outputs=_record_outputs)
    prev_btn.click(lambda: handle_navigation("previous"), outputs=_record_outputs)
    category_dropdown.change(
        handle_category_change,
        inputs=[category_dropdown],
        outputs=[save_status, csv_data],
    )
    export_btn.click(handle_export, outputs=[export_file])
    refresh_table_btn.click(refresh_table, outputs=[table_view])

    # JS for the arrow buttons.
    # NOTE(review): this literal contains no markup in this copy of the file,
    # so nothing is injected here — restore the script if one was intended.
    gr.HTML(""" """)


if __name__ == "__main__":
    demo.launch()