Spaces:
Running
Running
| """Repository loading and cloning.""" | |
import shutil
from pathlib import Path
from typing import Callable, Optional

from git import Repo, GitCommandError
from git.exc import InvalidGitRepositoryError

from coderag.config import get_settings
from coderag.ingestion.validator import GitHubRepoInfo
from coderag.logging import get_logger
# Module-level logger; calls in this module pass context as keyword arguments.
logger = get_logger(__name__)

# Signature of the optional progress hook accepted by RepositoryLoader:
# called with a short status message and an integer progress value
# (this module passes 0, 30 and 100).
ProgressCallback = Callable[[str, int], None]
class LoaderError(Exception):
    """Raised when a repository cannot be cloned or updated."""
class RepositoryLoader:
    """Clone GitHub repositories into a local cache and keep them up to date.

    Each repository lives at ``<cache_dir>/<owner>/<name>``.
    """

    # Branch names tried, in order, after any explicitly requested ones.
    _FALLBACK_BRANCHES = ("main", "master")

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Create the loader and make sure the cache directory exists.

        Args:
            cache_dir: Directory used for cached clones. When omitted, the
                ``ingestion.repos_cache_dir`` setting is used.
        """
        settings = get_settings()
        # Path() is a no-op for Path inputs and tolerates a string-valued
        # setting or argument.
        self.cache_dir = Path(cache_dir or settings.ingestion.repos_cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the cache location for ``repo_info`` (``cache_dir/owner/name``)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    @staticmethod
    def _candidate_branches(
        requested: Optional[str],
        repo_default: Optional[str],
    ) -> list:
        """Return the ordered, de-duplicated list of branch names to try.

        Order: explicitly requested branch, the repository's own branch, then
        ``main`` and ``master``. Empty/``None`` entries are skipped.
        """
        ordered = (requested, repo_default, *RepositoryLoader._FALLBACK_BRANCHES)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(name for name in ordered if name))

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Return a local checkout of the repository, cloning it if necessary.

        If a cached copy exists it is updated in place. When that update
        fails, the stale copy is discarded and a fresh clone is attempted
        instead of giving up. Fresh clones are shallow (``depth=1``) and
        single-branch; candidate branches are tried in order until one works.

        Args:
            repo_info: Repository descriptor (owner, name, clone URL, branch).
            branch: Preferred branch; takes priority over ``repo_info.branch``.
            progress_callback: Optional hook called with a status message and
                an integer progress value.

        Returns:
            Path of the local checkout.

        Raises:
            LoaderError: If the repository could not be cloned with any of
                the candidate branches.
        """
        repo_path = self.get_repo_path(repo_info)
        branches_to_try = self._candidate_branches(branch, repo_info.branch)

        if repo_path.exists():
            logger.info("Repository exists, updating", path=str(repo_path))
            try:
                return self._update_repository(
                    repo_path, branches_to_try[0], progress_callback
                )
            except LoaderError:
                # _update_repository has already logged the failure and
                # removed the cached copy. A shallow single-branch cache
                # cannot switch to another branch, so start over with a
                # fresh clone and the normal branch fallback.
                logger.info("Falling back to fresh clone", path=str(repo_path))

        if progress_callback:
            progress_callback("Cloning repository", 0)
        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error: Optional[GitCommandError] = None
        for try_branch in branches_to_try:
            logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
            try:
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                )
            except GitCommandError as e:
                last_error = e
                # The failure is not necessarily a missing branch (it may be
                # a network or auth error), so record the actual error.
                logger.debug(
                    "Clone attempt failed, trying next branch",
                    branch=try_branch,
                    error=str(e),
                )
                # Clean up partial clone if any.
                shutil.rmtree(repo_path, ignore_errors=True)
            else:
                if progress_callback:
                    progress_callback("Clone complete", 100)
                logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
                return repo_path

        raise LoaderError(
            f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}"
        ) from last_error

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch, check out ``branch`` and pull in an existing checkout.

        On failure the cached copy is removed so that a later clone starts
        from a clean directory.

        Args:
            repo_path: Location of the existing checkout.
            branch: Branch to check out before pulling.
            progress_callback: Optional hook called with a status message and
                an integer progress value.

        Returns:
            ``repo_path`` once the update succeeded.

        Raises:
            LoaderError: If a git command fails or ``repo_path`` is not a
                valid git repository.
        """
        try:
            repo = Repo(repo_path)
            if progress_callback:
                progress_callback("Fetching updates", 30)
            origin = repo.remotes.origin
            origin.fetch()
            repo.git.checkout(branch)
            origin.pull()
        except (GitCommandError, InvalidGitRepositoryError) as e:
            logger.warning("Update failed, re-cloning", error=str(e))
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}") from e

        if progress_callback:
            progress_callback("Update complete", 100)
        logger.info("Repository updated", path=str(repo_path))
        return repo_path

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return True if a cache directory exists for ``repo_info``."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached checkout for ``repo_info``; no-op if absent."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))