Spaces:
Runtime error
Runtime error
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.common.exceptions import TimeoutException, NoSuchElementException | |
import time | |
import pandas as pd | |
import os | |
import logging | |
# Module-level logger named after this module; used by every function below.
logger = logging.getLogger(__name__)
def comprehensive_scroll(driver, pause_seconds=3, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    After each scroll the function waits ``pause_seconds`` and re-reads
    ``document.body.scrollHeight``; it stops as soon as two consecutive
    readings are equal, or once ``max_scrolls`` scrolls have been done.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this file).
        pause_seconds: Seconds to sleep after each scroll so lazily loaded
            content has time to appear. Defaults to 3.
        max_scrolls: Upper bound on the number of scrolls, or ``None``
            (default) for no bound. Pass a number for pages that keep
            growing forever, otherwise the loop never ends.

    Returns:
        None. Any exception raised while scrolling is logged and swallowed,
        so the caller can carry on with whatever content was loaded.
    """
    height_script = "return document.body.scrollHeight"
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);"
    try:
        last_height = driver.execute_script(height_script)
        scrolls_done = 0
        while max_scrolls is None or scrolls_done < max_scrolls:
            driver.execute_script(scroll_script)
            scrolls_done += 1
            time.sleep(pause_seconds)
            new_height = driver.execute_script(height_script)
            if new_height == last_height:
                # Height unchanged after a scroll: nothing new was loaded.
                break
            last_height = new_height
    except Exception as e:
        # Best effort: a scrolling failure must not abort the whole scrape.
        logger.error(f"Scroll sırasında hata: {str(e)}")
# Absolute XPath matching every review container on the target page.
_REVIEW_XPATH = '/html/body/div[1]/div[4]/div/div/div/div/div[3]/div/div/div[3]/div[2]/div'


def _create_chrome_driver(chrome_options):
    """Start Chrome with the first chromedriver location that works.

    Tries ``/usr/local/bin/chromedriver``, then ``/usr/bin/chromedriver``,
    then a ``chromedriver`` resolved from PATH. A failure of the last
    candidate propagates to the caller.
    """
    *fallbacks, last_resort = (
        '/usr/local/bin/chromedriver',
        '/usr/bin/chromedriver',
        'chromedriver',
    )
    for path in fallbacks:
        try:
            return webdriver.Chrome(service=Service(path), options=chrome_options)
        except Exception as exc:
            # Not a bare ``except``: Ctrl-C / SystemExit must still get through.
            logger.warning(f"ChromeDriver başlatılamadı ({path}): {exc}")
    return webdriver.Chrome(service=Service(last_resort), options=chrome_options)


def _accept_cookie_popup(driver):
    """Click the cookie-consent button if it becomes clickable within 10 s."""
    try:
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))
        ).click()
        logger.info("Çerez popup'ı kabul edildi")
    except TimeoutException:
        logger.warning("Çerez popup'ı bulunamadı veya tıklanamadı")


def _text_or_na(driver, xpath):
    """Return the text of the element at ``xpath``, or "N/A" if it is absent."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return "N/A"


def _extract_review(driver, index):
    """Build the row dict for the ``index``-th (1-based) review on the page."""
    review_xpath = f"{_REVIEW_XPATH}[{index}]"
    header_xpath = f"{review_xpath}/div[1]"

    username = _text_or_na(driver, f"{header_xpath}/div[2]/div[1]")
    comment = _text_or_na(driver, f"{review_xpath}/div[2]/p")
    date = _text_or_na(driver, f"{header_xpath}/div[2]/div[2]")

    # Each completely filled star is rendered at 100% width; counting them
    # gives the rating. find_elements returns [] when nothing matches, so no
    # exception handling is needed here.
    full_stars = driver.find_elements(
        By.XPATH,
        f"{header_xpath}/div[1]/div/div[@class='star-w']/div[@class='full'][@style='width: 100%; max-width: 100%;']",
    )

    return {
        "Kullanıcı_id": index,
        "Kullanıcı Adı": username,
        "Yorum": comment,
        "Tarih": date,
        "Yıldız Sayısı": len(full_stars),
    }


def scrape_reviews(url):
    """Scrape all reviews from ``url`` with headless Chrome.

    The page is opened, the cookie popup is accepted when present, the page
    is scrolled until no more content loads, and every review container is
    read into one row.

    Args:
        url: Address of the review page to open.

    Returns:
        pandas.DataFrame with the columns "Kullanıcı_id", "Kullanıcı Adı",
        "Yorum", "Tarih" and "Yıldız Sayısı" (one row per review). If any
        step fails, the error is logged and an empty DataFrame is returned
        instead of raising.

    Side effects:
        Creates the ``data`` directory if missing. The rows are written to
        ``data/temp_comments.csv`` and that file is removed again before the
        function returns, so it never outlives the call.
    """
    driver = None
    data_directory = "data"
    temp_file = os.path.join(data_directory, 'temp_comments.csv')
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(data_directory, exist_ok=True)

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument("--window-size=1920,1080")

        driver = _create_chrome_driver(chrome_options)

        logger.info(f"URL'ye erişiliyor: {url}")
        driver.get(url)

        _accept_cookie_popup(driver)

        logger.info("Sayfa kaydırılıyor...")
        comprehensive_scroll(driver)

        logger.info("Yorumlar toplanıyor...")
        total_comments = len(driver.find_elements(By.XPATH, _REVIEW_XPATH))
        logger.info(f"Toplam {total_comments} yorum bulundu")

        data = []
        for i in range(1, total_comments + 1):
            data.append(_extract_review(driver, i))
            if i % 10 == 0:
                logger.info(f"{i}/{total_comments} yorum toplandı")

        df = pd.DataFrame(data)

        # NOTE(review): this file is deleted again in ``finally`` below; kept
        # to preserve the original behaviour.
        df.to_csv(temp_file, index=False, encoding='utf-8-sig')
        logger.info(f"Veriler {temp_file} dosyasına kaydedildi")
        return df
    except Exception as e:
        # Top-level boundary: log with traceback and fall back to an empty frame.
        logger.exception(f"Veri çekme sırasında hata: {str(e)}")
        return pd.DataFrame()
    finally:
        # Cleanup must never raise: an exception escaping ``finally`` would
        # replace the return value above and skip the temp-file removal.
        if driver:
            try:
                driver.quit()
                logger.info("Chrome driver kapatıldı")
            except Exception as e:
                logger.warning(f"Chrome driver kapatılırken hata: {e}")
        try:
            os.remove(temp_file)
            logger.info("Geçici dosya silindi")
        except FileNotFoundError:
            # Nothing was written (e.g. the scrape failed early).
            pass
        except OSError as e:
            logger.warning(f"Geçici dosya silinemedi: {e}")