# Sebastiangmz's picture
# Update to v0.1.2
# 42f5b98
"""Repository loading and cloning."""
import shutil
from pathlib import Path
from typing import Callable, Optional

from git import Repo, GitCommandError
from git.exc import InvalidGitRepositoryError, NoSuchPathError

from coderag.config import get_settings
from coderag.logging import get_logger
from coderag.ingestion.validator import GitHubRepoInfo
# Module-level logger; call sites in this file pass context as keyword arguments.
logger = get_logger(__name__)

# Signature of the optional progress hook: callback(message, value).
# The call sites in this module pass the values 0, 30 and 100 -- presumably a
# completion percentage; confirm against the consumers of this callback.
ProgressCallback = Callable[[str, int], None]
class LoaderError(Exception):
    """Raised when a repository cannot be cloned or refreshed."""
class RepositoryLoader:
    """Clone GitHub repositories into a local cache and keep them up to date.

    Every repository is stored at ``<cache_dir>/<owner>/<name>``.
    """

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Create the loader and make sure the cache directory exists.

        Args:
            cache_dir: Root directory for cached clones. When omitted (or
                falsy), ``settings.ingestion.repos_cache_dir`` is used.
        """
        settings = get_settings()
        # Coerce to Path so a plain string coming from configuration also works.
        self.cache_dir = Path(cache_dir or settings.ingestion.repos_cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the cache location for ``repo_info`` (``cache_dir/owner/name``)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    @staticmethod
    def _candidate_branches(requested: Optional[str], default: Optional[str]) -> list:
        """Return the branch names to try, in priority order and without duplicates.

        Order: explicitly requested branch, branch recorded on the repo info,
        then the conventional ``main`` and ``master``.
        """
        ordered = (requested, default, "main", "master")
        # dict.fromkeys keeps first-seen order while dropping repeats.
        return list(dict.fromkeys(name for name in ordered if name))

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Make a local copy of the repository available and return its path.

        If a cached copy exists it is updated in place; when that update fails
        the cached copy is discarded and the repository is cloned again. A
        fresh clone is shallow (``depth=1``) and single-branch, and each
        candidate branch is tried in turn until one succeeds.

        Args:
            repo_info: Description of the repository (owner, name, clone URL,
                optional branch).
            branch: Branch to prefer over ``repo_info.branch``.
            progress_callback: Optional hook called with a message and a
                progress value.

        Returns:
            Path of the local working copy.

        Raises:
            LoaderError: If no candidate branch could be cloned.
        """
        repo_path = self.get_repo_path(repo_info)
        branches_to_try = self._candidate_branches(branch, repo_info.branch)

        if repo_path.exists():
            logger.info("Repository exists, updating", path=str(repo_path))
            try:
                return self._update_repository(repo_path, branches_to_try[0], progress_callback)
            except LoaderError as exc:
                # The failed update has already removed the cached copy, so
                # recover with a fresh clone instead of failing the request.
                logger.info(
                    "Falling back to a fresh clone",
                    path=str(repo_path),
                    error=str(exc),
                )

        if progress_callback:
            progress_callback("Cloning repository", 0)

        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error: Optional[GitCommandError] = None
        for candidate in branches_to_try:
            logger.info("Trying to clone", url=repo_info.clone_url, branch=candidate)
            try:
                cloned = Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=candidate,
                    depth=1,
                    single_branch=True,
                )
            except GitCommandError as e:
                last_error = e
                logger.debug(
                    "Branch not found, trying next",
                    branch=candidate,
                    error=str(e),
                )
                # Remove whatever a failed attempt left behind so the next
                # attempt starts from an empty destination.
                shutil.rmtree(repo_path, ignore_errors=True)
                continue

            # Only the files on disk are needed; release the handles held by
            # the Repo object.
            cloned.close()
            if progress_callback:
                progress_callback("Clone complete", 100)
            logger.info("Repository cloned", path=str(repo_path), branch=candidate)
            return repo_path

        raise LoaderError(
            f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}"
        ) from last_error

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch, check out ``branch`` and pull in an existing working copy.

        On failure the working copy is deleted so that a later clone starts
        from a clean state.

        Args:
            repo_path: Location of the cached working copy.
            branch: Branch to check out before pulling.
            progress_callback: Optional hook called with a message and a
                progress value.

        Returns:
            ``repo_path`` once the update succeeded.

        Raises:
            LoaderError: If the directory is not a usable git repository or
                any git command fails.
        """
        try:
            repo = Repo(repo_path)
            try:
                if progress_callback:
                    progress_callback("Fetching updates", 30)
                repo.remotes.origin.fetch()
                repo.git.checkout(branch)
                repo.remotes.origin.pull()
            finally:
                # Release git handles before the directory may be removed below.
                repo.close()
        except (GitCommandError, InvalidGitRepositoryError, NoSuchPathError) as e:
            logger.warning("Update failed, re-cloning", error=str(e))
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}") from e

        if progress_callback:
            progress_callback("Update complete", 100)
        logger.info("Repository updated", path=str(repo_path))
        return repo_path

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return True when a cache entry exists on disk for ``repo_info``."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached working copy of ``repo_info`` if there is one."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))