Spaces:
Running
Running
| """GitHub URL validation and parsing.""" | |
| import re | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import httpx | |
| from coderag.logging import get_logger | |
| logger = get_logger(__name__) | |
@dataclass
class GitHubRepoInfo:
    """Parsed GitHub repository information.

    Attributes:
        owner: Account or organisation segment of the repository path.
        name: Repository segment of the repository path.
        url: Browser URL of the repository.
        branch: Branch name, or ``None`` when it has not been set.
    """

    owner: str
    name: str
    url: str
    branch: Optional[str] = None

    # The three accessors below are properties because callers read them as
    # plain attributes (e.g. ``repo_info.api_url``), not as method calls.

    @property
    def full_name(self) -> str:
        """Return the ``owner/name`` form of the repository."""
        return f"{self.owner}/{self.name}"

    @property
    def clone_url(self) -> str:
        """Return the HTTPS ``.git`` URL for the repository."""
        return f"https://github.com/{self.owner}/{self.name}.git"

    @property
    def api_url(self) -> str:
        """Return the ``api.github.com/repos/...`` URL for the repository."""
        return f"https://api.github.com/repos/{self.owner}/{self.name}"
class ValidationError(Exception):
    """Raised when a GitHub URL or repository fails validation."""
class GitHubURLValidator:
    """Validates and parses GitHub repository URLs.

    Three input forms are recognised (see ``GITHUB_PATTERNS``):

    * ``http(s)://github.com/<owner>/<name>`` with an optional ``.git``
      suffix and optional trailing slash,
    * ``git@github.com:<owner>/<name>`` with an optional ``.git`` suffix,
    * the bare ``<owner>/<name>`` shorthand.
    """

    GITHUB_PATTERNS = [
        r"^https?://github\.com/(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?/?$",
        r"^git@github\.com:(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?$",
        r"^(?P<owner>[a-zA-Z0-9](?:[a-zA-Z0-9]|-(?=[a-zA-Z0-9])){0,38})/(?P<name>[a-zA-Z0-9._-]+)$",
    ]

    def __init__(self, timeout: float = 10.0) -> None:
        """Store the HTTP timeout (seconds) and pre-compile the URL patterns."""
        self.timeout = timeout
        self._patterns = [re.compile(p) for p in self.GITHUB_PATTERNS]

    def parse_url(self, url: str) -> GitHubRepoInfo:
        """Parse *url* into a ``GitHubRepoInfo`` without any network access.

        Args:
            url: Repository reference in one of the supported forms;
                surrounding whitespace is ignored.

        Returns:
            Repository info whose ``url`` is normalised to
            ``https://github.com/<owner>/<name>``.

        Raises:
            ValidationError: If no pattern matches, or the extracted owner
                or repository name fails ``_is_valid_name``.
        """
        url = url.strip()
        for pattern in self._patterns:
            match = pattern.match(url)
            if match is None:
                continue

            owner = match.group("owner")
            name = match.group("name")
            # Remove one literal ".git" suffix. str.rstrip(".git") must not be
            # used here: it strips any trailing run of the characters
            # '.', 'g', 'i', 't', which turns e.g. "digit" into "d".
            if name.endswith(".git"):
                name = name[: -len(".git")]

            if not self._is_valid_name(owner) or not self._is_valid_name(name):
                raise ValidationError(f"Invalid owner or repository name: {url}")
            return GitHubRepoInfo(owner=owner, name=name, url=f"https://github.com/{owner}/{name}")

        raise ValidationError(f"Invalid GitHub URL: {url}. Expected: https://github.com/owner/repo")

    def _is_valid_name(self, name: str) -> bool:
        """Return True if *name* is 1-100 chars of ``[a-zA-Z0-9._-]`` starting with an alphanumeric."""
        if not name or len(name) > 100:
            return False
        # fullmatch rather than match + "$": "$" also matches just before a
        # trailing newline, which would let "repo\n" through.
        return re.fullmatch(r"[a-zA-Z0-9][a-zA-Z0-9._-]*", name) is not None

    async def validate_repository(self, url: str, check_accessibility: bool = True) -> GitHubRepoInfo:
        """Parse *url* and, optionally, confirm the repository is reachable.

        Args:
            url: Repository reference accepted by ``parse_url``.
            check_accessibility: When true, call ``_check_repo_accessible``
                before returning.

        Returns:
            The parsed repository info.

        Raises:
            ValidationError: If parsing or the accessibility check fails.
        """
        repo_info = self.parse_url(url)
        if check_accessibility:
            await self._check_repo_accessible(repo_info)
        logger.info("Repository validated", owner=repo_info.owner, name=repo_info.name)
        return repo_info

    async def _check_repo_accessible(self, repo_info: GitHubRepoInfo) -> None:
        """Fetch ``repo_info.api_url`` and reject missing, forbidden or private repos.

        On success ``repo_info.branch`` is set to the response's
        ``default_branch`` value, falling back to ``"main"`` when absent.

        Raises:
            ValidationError: On a non-200 status, a repository flagged
                ``private``, a timeout, or any other httpx request error.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.get(repo_info.api_url)
            # TimeoutException must be handled before the broader RequestError.
            except httpx.TimeoutException as exc:
                raise ValidationError(f"Timeout checking repository: {repo_info.full_name}") from exc
            except httpx.RequestError as exc:
                raise ValidationError(f"Network error: {str(exc)}") from exc

            status = response.status_code
            if status == 404:
                raise ValidationError(f"Repository not found: {repo_info.full_name}")
            if status == 403:
                raise ValidationError(f"Access denied: {repo_info.full_name}")
            if status != 200:
                raise ValidationError(f"HTTP error {status}: {repo_info.full_name}")

            data = response.json()
            if data.get("private", False):
                raise ValidationError(f"Private repository not supported: {repo_info.full_name}")
            repo_info.branch = data.get("default_branch", "main")

    def validate_url_sync(self, url: str) -> GitHubRepoInfo:
        """Synchronous, offline validation: equivalent to ``parse_url(url)``."""
        return self.parse_url(url)