# purpleriann's picture
# Upload folder using huggingface_hub
# a22e84b verified
import os
import shutil
import subprocess
import tempfile
from loguru import logger
from llm_engineering.domain.documents import RepositoryDocument
from .base import BaseCrawler
class GithubCrawler(BaseCrawler):
    """Crawler that clones a GitHub repository and stores its files as one document.

    The repository is cloned into a temporary directory, every non-ignored file
    is read into a ``{relative_path: content}`` mapping, and the mapping is
    saved through ``model`` (a ``RepositoryDocument``).
    """

    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        """Create the crawler.

        Args:
            ignore: Strings used both as directory-path prefixes and as
                file-name suffixes to skip. Passed straight to
                ``str.startswith`` / ``str.endswith``, so it must be a ``str``
                or a tuple of ``str``.
        """
        super().__init__()

        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        """Clone ``link`` and persist its file tree, unless it is already stored.

        Args:
            link: URL of the repository to clone.
            **kwargs: Must contain ``user``, an object exposing ``id`` and
                ``full_name``; it is recorded as the document's author.

        Raises:
            KeyError: If ``user`` is missing from ``kwargs``.
            subprocess.CalledProcessError: If ``git clone`` exits with a
                non-zero status.
        """
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Repository already exists in the database: {link}")

            return

        logger.info(f"Starting scrapping GitHub repository: {link}")

        # Fail fast on a missing author before paying for the clone.
        user = kwargs["user"]

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            # Run git inside the temp dir via ``cwd`` instead of ``os.chdir`` so
            # the process-wide working directory is never changed (and never
            # left pointing at a directory that is deleted below).
            # ``check=True`` surfaces a failed clone as CalledProcessError
            # rather than an unrelated IndexError on the listing below.
            subprocess.run(["git", "clone", link], cwd=local_temp, check=True)

            # The clone is the only entry in the fresh temp dir; list it rather
            # than guessing the directory name git derived from the URL.
            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])  # noqa: PTH118

            tree = self._read_tree(repo_path)

            instance = self.model(
                content=tree,
                name=repo_name,
                link=link,
                platform="github",
                author_id=user.id,
                author_full_name=user.full_name,
            )
            instance.save()
        finally:
            # Always remove the clone, whether or not the crawl succeeded.
            shutil.rmtree(local_temp)

        logger.info(f"Finished scrapping GitHub repository: {link}")

    def _read_tree(self, repo_path: str) -> dict:
        """Return ``{relative_path: content}`` for every non-ignored file under ``repo_path``."""
        tree = {}
        for root, _, files in os.walk(repo_path):
            rel_dir = os.path.relpath(root, repo_path)
            if rel_dir == os.curdir:
                # Keep top-level files keyed by bare name ("README.md", not "./README.md").
                rel_dir = ""

            # Prefix match on the relative directory: ".git" also skips
            # directories such as ".github".
            if rel_dir.startswith(self._ignore):
                continue

            for file_name in files:
                if file_name.endswith(self._ignore):
                    continue

                file_path = os.path.join(rel_dir, file_name)  # noqa: PTH118
                with open(os.path.join(root, file_name), "r", errors="ignore") as f:  # noqa: PTH123, PTH118
                    # Undecodable bytes are dropped (errors="ignore") and every
                    # space character is removed from the stored content;
                    # newlines and tabs are kept.
                    tree[file_path] = f.read().replace(" ", "")

        return tree