# Page header captured together with this file (not part of the program):
# author "limitedonly41", commit 1facdab "Update app.py" (verified).
import html
import io
import logging
import re
from typing import List, Optional, Tuple

import gradio as gr
import pandas as pd
# Logging setup: configure the root logger at INFO level and create the
# module-level logger used by the rest of this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class WebsiteCategorizerApp:
    """Hold the URL records loaded from a spreadsheet and a cursor over them.

    ``sheet_data`` and ``results_data`` are parallel lists (same order, same
    length). ``sheet_data`` keeps the source row index next to every URL;
    ``results_data`` is the ``url``/``category`` table that is exported as CSV.
    """

    # "/edit" as a complete path segment (followed by "?", "#" or the end of
    # the string), so that paths such as "/editor" are left untouched.
    _EDIT_SEGMENT = re.compile(r"/edit(?=[?#]|$)")
    # Sheet-tab id, whether it sits in the query string or in the fragment.
    _GID = re.compile(r"[?&#]gid=(\d+)")

    def __init__(self):
        self.sheet_url: str = ""
        self.sheet_data: List[dict] = []
        self.current_index: int = 0
        self.categories: List[str] = ["NEWS/BLOG", "E-commerce", "OTHER", "COMPANIES", "Short"]
        self.results_data: List[dict] = []

    def convert_google_sheet_url(self, sheet_url: str) -> str:
        """Turn a Google Sheets "edit" link into its CSV export link.

        Everything from the ``/edit`` segment onwards is replaced by
        ``/export?format=csv``; a ``gid`` found in the query string or the
        fragment is carried over as ``&gid=<id>``. A URL without an ``/edit``
        segment is returned unchanged apart from surrounding whitespace.

        Returns:
            The converted URL, or ``""`` when ``sheet_url`` is not a string.
        """
        try:
            cleaned = sheet_url.strip()
            edit = self._EDIT_SEGMENT.search(cleaned)
            if edit is None:
                return cleaned
            export_url = cleaned[:edit.start()] + "/export?format=csv"
            # Only look after "/edit": that is where "?gid=" / "#gid=" live.
            gid = self._GID.search(cleaned, edit.end())
            if gid is not None:
                export_url += "&gid=" + gid.group(1)
            return export_url
        except (AttributeError, TypeError) as e:
            logger.error(f"Ошибка конвертации URL: {e}")
            return ""

    @staticmethod
    def _extract_records(df: "pd.DataFrame") -> List[dict]:
        """Collect ``{"index", "url", "category"}`` dicts from the data frame.

        The first column supplies the URLs; only non-empty values that start
        with "http" (case-insensitive) are kept. The category is read from the
        column whose header equals "category" (case-insensitive, stripped),
        and is ``""`` when there is no such column or the cell is empty.
        """
        url_column = df.columns[0]
        category_column = next(
            (c for c in df.columns if str(c).strip().lower() == "category"),
            None,
        )
        if category_column is None:
            raw_categories = [None] * len(df)
        else:
            raw_categories = df[category_column]

        records = []
        for index, raw_url, raw_category in zip(df.index, df[url_column], raw_categories):
            if pd.isna(raw_url):
                continue
            url = str(raw_url).strip()
            # Non-URL cells (e.g. titles) are skipped.
            if not url.lower().startswith("http"):
                continue
            if raw_category is None or pd.isna(raw_category):
                category = ""
            else:
                category = str(raw_category).strip()
            records.append({"index": index, "url": url, "category": category})
        return records

    def connect_to_sheet(self, sheet_url: str) -> Tuple[str, str]:
        """Load the sheet behind ``sheet_url`` and reset the cursor to row one.

        Returns:
            ``(status, first_url)``. ``status`` starts with "✅" on success and
            with "❌" on failure; ``first_url`` is the display URL of the first
            record, or ``""`` on failure.
        """
        # This method is called straight from the UI, so every failure
        # (network, parsing, bad input) is reported as a status string.
        try:
            sheet_url = (sheet_url or "").strip()
            if not sheet_url:
                return "❌ Ошибка: Введите URL Google таблицы", ""

            csv_url = self.convert_google_sheet_url(sheet_url)
            if not csv_url:
                return "❌ Ошибка: Неверный формат URL", ""

            df = pd.read_csv(csv_url)
            if df.empty or df.shape[1] < 1:
                return "❌ Ошибка: Таблица пуста или нет данных", ""

            records = self._extract_records(df)
            self.sheet_data = records
            self.results_data = [
                {"url": record["url"], "category": record["category"]}
                for record in records
            ]
            if not records:
                return "❌ Ошибка: Не найдены валидные URL", ""

            self.current_index = 0
            self.sheet_url = sheet_url
            return (
                f"✅ Подключено успешно! Найдено {len(self.sheet_data)} записей",
                self.get_current_url_for_display(),
            )
        except Exception as e:
            logger.exception("Ошибка подключения к таблице: %s", e)
            return f"❌ Ошибка: {str(e)}", ""

    def get_current_url_for_display(self) -> str:
        """Return the current record's URL, prefixed with "http://" if needed.

        Returns ``""`` when nothing is loaded or the cursor is out of range.
        """
        if not self.sheet_data or self.current_index >= len(self.sheet_data):
            return ""
        url = self.sheet_data[self.current_index]["url"]
        # Case-insensitive, like the loader: "HTTP://x" already has a scheme.
        if url and not url.lower().startswith(("http://", "https://")):
            url = "http://" + url
        return url

    def get_current_info(self) -> Tuple[str, str, str]:
        """Return ``(url, category, "position/total")`` for the current record.

        With no data loaded the result is ``("", "", "Нет данных")``. A cursor
        that ran past the end is reset to the first record.
        """
        if not self.sheet_data:
            return "", "", "Нет данных"
        if self.current_index >= len(self.sheet_data):
            self.current_index = 0
        current = self.sheet_data[self.current_index]
        position = f"{self.current_index + 1}/{len(self.sheet_data)}"
        return current["url"], current["category"], position

    def navigate_to_index(self, index: int) -> Tuple[str, str, str, str]:
        """Move the cursor to ``index`` (clamped to the valid range).

        Returns:
            ``(url, category, info, display_url)``. With no data loaded the
            message goes into ``info`` and both URLs are empty.
        """
        if not self.sheet_data:
            return "", "", "Нет данных", ""
        self.current_index = max(0, min(index, len(self.sheet_data) - 1))
        url, category, info = self.get_current_info()
        return url, category, info, self.get_current_url_for_display()

    def previous_record(self) -> Tuple[str, str, str, str]:
        """Step back one record, wrapping from the first to the last."""
        if not self.sheet_data:
            return "", "", "Нет данных", ""
        return self.navigate_to_index((self.current_index - 1) % len(self.sheet_data))

    def next_record(self) -> Tuple[str, str, str, str]:
        """Step forward one record, wrapping from the last to the first."""
        if not self.sheet_data:
            return "", "", "Нет данных", ""
        return self.navigate_to_index((self.current_index + 1) % len(self.sheet_data))

    def save_category(self, category: str) -> Tuple[str, str]:
        """Store ``category`` on the current record.

        Returns:
            ``(status, csv_text)`` where ``csv_text`` is the full results table
            after the change, or ``""`` when nothing could be saved.
        """
        if not self.sheet_data:
            return "❌ Нет данных для сохранения", ""
        # UI entry point: report any failure as a status instead of raising.
        try:
            self.sheet_data[self.current_index]["category"] = category
            self.results_data[self.current_index]["category"] = category
            return f"✅ '{category}' сохранено", self.export_results()
        except Exception as e:
            logger.exception("Ошибка сохранения категории: %s", e)
            return f"❌ Ошибка: {str(e)}", ""

    def export_results(self) -> str:
        """Return the results table as CSV text (``""`` when it is empty)."""
        if not self.results_data:
            return ""
        csv_buffer = io.StringIO()
        pd.DataFrame(self.results_data).to_csv(csv_buffer, index=False)
        return csv_buffer.getvalue()
# Application state. NOTE: this is one module-level instance, so every browser
# session served by this process reads and mutates the same records.
app = WebsiteCategorizerApp()

# Column order of the results table shown in the second tab.
TABLE_COLUMNS = ["url", "category"]

# Shown in the preview pane while there is no URL to display.
PLACEHOLDER_HTML = """
<div style='height:900px;display:flex;align-items:center;justify-content:center;background:#eee;border-radius:8px;'>
<p>Подключите Google таблицу</p>
</div>
"""

# Arrow-key navigation. It is registered through a page-load event because a
# <script> tag placed inside gr.HTML is inserted as inert markup and never runs.
# Keys pressed inside form fields are ignored so the text cursor still works.
ARROW_KEYS_JS = """
() => {
    document.addEventListener('keydown', function(event) {
        const tag = (event.target && event.target.tagName) || '';
        if (tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT') {
            return;
        }
        if (event.key === "ArrowRight") {
            document.getElementById('next-btn')?.click();
        }
        if (event.key === "ArrowLeft") {
            document.getElementById('prev-btn')?.click();
        }
    });
}
"""


def render_viewer(url):
    """Return the preview-pane HTML: an iframe for ``url``, else the placeholder.

    The URL comes from spreadsheet cells, so it is escaped before being placed
    inside the ``src`` attribute.
    """
    if not url:
        return PLACEHOLDER_HTML
    safe_url = html.escape(url, quote=True)
    return f'<iframe src="{safe_url}" width="100%" height="900px" style="border-radius:8px;"></iframe>'


def results_frame():
    """Return the current results as a DataFrame with fixed columns."""
    return pd.DataFrame(app.results_data, columns=TABLE_COLUMNS)


def handle_connect(url):
    """Connect to the sheet and return updates for status, viewer, URL box,
    category dropdown and position label (in that order)."""
    status, iframe_url = app.connect_to_sheet(url)
    # The status may contain exception text; it is rendered in an HTML component.
    safe_status = html.escape(status)
    if "✅" not in status:
        return (
            safe_status,
            PLACEHOLDER_HTML,
            "",
            gr.update(choices=app.categories, value=None),
            "",
        )

    url_display, category, info = app.get_current_info()
    # Built-in categories first, then categories found in the sheet, each once
    # and in a stable order (a set would shuffle the dropdown).
    sheet_categories = [row["category"] for row in app.results_data if row["category"]]
    all_categories = list(dict.fromkeys([*app.categories, *sheet_categories]))
    return (
        safe_status,
        render_viewer(iframe_url),
        url_display,
        # An empty category means "nothing selected", not a choice named "".
        gr.update(choices=all_categories, value=category or None),
        info,
    )


def handle_navigation(direction):
    """Move to the next ("next") or previous (anything else) record and return
    updates for viewer, URL box, category dropdown and position label."""
    if not app.sheet_data:
        return PLACEHOLDER_HTML, "", gr.update(value=None), "Нет данных"
    step = app.next_record if direction == "next" else app.previous_record
    url_display, category, info, iframe_url = step()
    return render_viewer(iframe_url), url_display, gr.update(value=category or None), info


def handle_category_change(category):
    """Save the category picked by the user; return (status HTML, CSV text)."""
    status, csv_content = app.save_category(category)
    return html.escape(status), csv_content


def handle_export():
    """Write the results to ``results.csv`` and expose it for download."""
    csv_content = app.export_results()
    if csv_content:
        # newline="" keeps the line endings exactly as pandas produced them.
        with open("results.csv", "w", encoding="utf-8", newline="") as f:
            f.write(csv_content)
        return gr.File(value="results.csv", visible=True)
    return gr.File(visible=False)


def refresh_table():
    """Return the current results for the table tab."""
    return results_frame()


# NOTE(review): the nesting below was reconstructed from the statement order;
# confirm the placement of `record_info` against the intended layout.
with gr.Blocks(title="Категоризатор сайтов", theme=gr.themes.Soft()) as demo:
    gr.HTML("<h2 style='text-align:center;'>🌐 Категоризатор сайтов</h2>")

    with gr.Tabs():
        with gr.TabItem("Категоризация"):
            with gr.Row():
                with gr.Column(scale=1):
                    sheet_url_input = gr.Textbox(label="URL Google таблицы", lines=2)
                    connect_btn = gr.Button("🔗 Подключить", variant="primary")
                    connection_status = gr.HTML("")

                    with gr.Row():
                        prev_btn = gr.Button("⬅️", elem_id="prev-btn")
                        next_btn = gr.Button("➡️", elem_id="next-btn")

                    record_info = gr.HTML("")
                    current_url_display = gr.Textbox(label="Текущий URL", interactive=False)
                    category_dropdown = gr.Dropdown(choices=app.categories, label="Категория")
                    save_status = gr.HTML("")
                    export_btn = gr.Button("📥 Скачать CSV")
                    export_file = gr.File(visible=False)

                with gr.Column(scale=5):
                    website_viewer = gr.HTML(PLACEHOLDER_HTML)

        with gr.TabItem("Текущая таблица"):
            table_view = gr.DataFrame(
                value=results_frame(),
                headers=TABLE_COLUMNS,
                datatype=["str", "str"],
                interactive=True,
            )
            refresh_table_btn = gr.Button("🔄 Обновить таблицу")

    csv_data = gr.State("")

    connect_btn.click(
        handle_connect,
        inputs=[sheet_url_input],
        outputs=[connection_status, website_viewer, current_url_display, category_dropdown, record_info],
    )
    next_btn.click(
        lambda: handle_navigation("next"),
        outputs=[website_viewer, current_url_display, category_dropdown, record_info],
    )
    prev_btn.click(
        lambda: handle_navigation("previous"),
        outputs=[website_viewer, current_url_display, category_dropdown, record_info],
    )
    # `input` fires only for user edits. `change` would also fire when
    # connecting or navigating sets the dropdown, which re-saved values and
    # could write None over the current record after a failed re-connect.
    category_dropdown.input(
        handle_category_change,
        inputs=[category_dropdown],
        outputs=[save_status, csv_data],
    )
    export_btn.click(handle_export, outputs=[export_file])
    refresh_table_btn.click(refresh_table, outputs=[table_view])

    # Install the arrow-key shortcuts once the page has loaded.
    demo.load(fn=None, inputs=None, outputs=None, js=ARROW_KEYS_JS)

if __name__ == "__main__":
    demo.launch()