Files
breakpilot-lehrer/klausur-service/backend/github_crawler_core.py
Benjamin Admin 34da9f4cda [split-required] Split 700-870 LOC files across all services
backend-lehrer (11 files):
- llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6)
- messenger_api.py (840 → 5), print_generator.py (824 → 5)
- unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4)
- llm_gateway/routes/edu_search_seeds.py (710 → 4)

klausur-service (12 files):
- ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4)
- legal_corpus_api.py (790 → 4), page_crop.py (758 → 3)
- mail/ai_service.py (747 → 4), github_crawler.py (767 → 3)
- trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4)
- dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4)

website (6 pages):
- audit-checklist (867 → 8), content (806 → 6)
- screen-flow (790 → 4), scraper (789 → 5)
- zeugnisse (776 → 5), modules (745 → 4)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-25 08:01:18 +02:00

412 lines
14 KiB
Python

"""
GitHub Crawler - Core Crawler and Downloader
GitHubCrawler for API-based repository crawling and RepositoryDownloader
for ZIP-based local extraction.
Extracted from github_crawler.py to keep files under 500 LOC.
"""
import asyncio
import logging
import os
import shutil
import tempfile
import zipfile
from fnmatch import fnmatch
from pathlib import Path
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import httpx
from template_sources import SourceConfig
from github_crawler_parsers import (
ExtractedDocument,
MarkdownParser,
HTMLParser,
JSONParser,
)
logger = logging.getLogger(__name__)
# Configuration
GITHUB_API_URL = "https://api.github.com"
GITLAB_API_URL = "https://gitlab.com/api/v4"  # defined for future use; GitLab crawling is not yet implemented (see crawl_repository)
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")  # optional; authenticated requests get higher GitHub rate limits
MAX_FILE_SIZE = 1024 * 1024 # 1 MB max file size
REQUEST_TIMEOUT = 60.0  # seconds, per API/raw-file HTTP request
RATE_LIMIT_DELAY = 1.0  # seconds slept between GitHub API calls to stay under rate limits
class GitHubCrawler:
    """Crawl GitHub repositories for legal templates via the GitHub REST API.

    Must be used as an async context manager so the underlying HTTP client
    is opened and closed deterministically:

        async with GitHubCrawler() as crawler:
            async for doc in crawler.crawl_repository(source):
                ...
    """

    def __init__(self, token: Optional[str] = None):
        """Initialize headers; falls back to the GITHUB_TOKEN env variable.

        Without a token, requests are subject to GitHub's (much lower)
        unauthenticated rate limits.
        """
        self.token = token or GITHUB_TOKEN
        self.headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": "LegalTemplatesCrawler/1.0",
        }
        if self.token:
            self.headers["Authorization"] = f"token {self.token}"
        # Created in __aenter__; None means "not inside 'async with'".
        self.http_client: Optional["httpx.AsyncClient"] = None

    async def __aenter__(self):
        self.http_client = httpx.AsyncClient(
            timeout=REQUEST_TIMEOUT,
            headers=self.headers,
            follow_redirects=True,
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.http_client:
            await self.http_client.aclose()

    def _parse_repo_url(self, url: str) -> Tuple[str, str, str]:
        """Parse a repository URL into (owner, repo, host).

        host is 'gitlab' when the netloc contains 'gitlab', else 'github'.

        Raises:
            ValueError: when the URL path has fewer than two segments.
        """
        parsed = urlparse(url)
        path_parts = parsed.path.strip('/').split('/')
        if len(path_parts) < 2:
            raise ValueError(f"Invalid repository URL: {url}")
        owner = path_parts[0]
        repo = path_parts[1]
        # Strip only a trailing '.git' suffix; a plain replace() would also
        # corrupt names that contain '.git' elsewhere (e.g. 'my.github.io').
        if repo.endswith('.git'):
            repo = repo[:-4]
        host = 'gitlab' if 'gitlab' in parsed.netloc else 'github'
        return owner, repo, host

    async def get_default_branch(self, owner: str, repo: str) -> str:
        """Return the repository's default branch ('main' if not reported)."""
        if not self.http_client:
            raise RuntimeError("Crawler not initialized. Use 'async with' context.")
        url = f"{GITHUB_API_URL}/repos/{owner}/{repo}"
        response = await self.http_client.get(url)
        response.raise_for_status()
        data = response.json()
        return data.get("default_branch", "main")

    async def get_latest_commit(self, owner: str, repo: str, branch: str = "main") -> str:
        """Return the SHA of the latest commit on `branch` ('' if missing)."""
        if not self.http_client:
            raise RuntimeError("Crawler not initialized. Use 'async with' context.")
        url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/commits/{branch}"
        response = await self.http_client.get(url)
        response.raise_for_status()
        data = response.json()
        return data.get("sha", "")

    async def list_files(
        self,
        owner: str,
        repo: str,
        path: str = "",  # kept for interface compatibility; not used by the tree API call
        branch: str = "main",
        patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
    ) -> List[Dict[str, Any]]:
        """List repository blobs whose paths match `patterns`.

        Uses the recursive git-trees endpoint (one request for the whole
        tree). Excluded patterns win over inclusion patterns, and files
        larger than MAX_FILE_SIZE are skipped with a warning.

        Returns:
            A list of dicts with 'path', 'sha', 'size' and 'url' keys.
        """
        if not self.http_client:
            raise RuntimeError("Crawler not initialized. Use 'async with' context.")
        patterns = patterns or ["*.md", "*.txt", "*.html"]
        exclude_patterns = exclude_patterns or []
        url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
        response = await self.http_client.get(url)
        response.raise_for_status()
        data = response.json()
        files = []
        for item in data.get("tree", []):
            if item["type"] != "blob":  # skip sub-trees, submodules, symlinks
                continue
            file_path = item["path"]
            if any(fnmatch(file_path, pattern) for pattern in exclude_patterns):
                continue
            if not any(fnmatch(file_path, pattern) for pattern in patterns):
                continue
            if item.get("size", 0) > MAX_FILE_SIZE:
                logger.warning(f"Skipping large file: {file_path} ({item['size']} bytes)")
                continue
            files.append({
                "path": file_path,
                "sha": item["sha"],
                "size": item.get("size", 0),
                "url": item.get("url", ""),
            })
        return files

    async def get_file_content(self, owner: str, repo: str, path: str, branch: str = "main") -> str:
        """Fetch a file's raw text content via raw.githubusercontent.com."""
        if not self.http_client:
            raise RuntimeError("Crawler not initialized. Use 'async with' context.")
        url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
        response = await self.http_client.get(url)
        response.raise_for_status()
        return response.text

    def _documents_from_file(
        self,
        content: str,
        file_path: str,
        source_url: str,
        commit_sha: str,
    ) -> "List[ExtractedDocument]":
        """Dispatch file content to the parser matching its extension.

        Returns a (possibly empty) list of documents, each stamped with
        `source_url` and `commit_sha`. Unknown extensions yield nothing.
        """
        docs = []
        if file_path.endswith('.md'):
            docs.append(MarkdownParser.parse(content, file_path))
        elif file_path.endswith(('.html', '.htm')):
            docs.append(HTMLParser.parse(content, file_path))
        elif file_path.endswith('.json'):
            # A JSON file may contain multiple documents.
            docs.extend(JSONParser.parse(content, file_path))
        elif file_path.endswith('.txt'):
            # Plain text has no dedicated parser; reuse the Markdown
            # helpers for language detection and placeholder discovery.
            docs.append(ExtractedDocument(
                text=content,
                title=Path(file_path).stem,
                file_path=file_path,
                file_type="text",
                source_url=source_url,
                source_commit=commit_sha,
                language=MarkdownParser._detect_language(content),
                placeholders=MarkdownParser._find_placeholders(content),
            ))
        for doc in docs:
            doc.source_url = source_url
            doc.source_commit = commit_sha
        return docs

    async def crawl_repository(
        self,
        source: "SourceConfig",
    ) -> "AsyncGenerator[ExtractedDocument, None]":
        """Crawl a repository and yield extracted documents.

        Skips sources without a repo URL and GitLab-hosted repositories
        (not yet supported). Per-file HTTP/parse failures are logged and
        skipped; this generator never raises.
        """
        if not source.repo_url:
            logger.warning(f"No repo URL for source: {source.name}")
            return
        try:
            owner, repo, host = self._parse_repo_url(source.repo_url)
        except ValueError as e:
            logger.error(f"Failed to parse repo URL for {source.name}: {e}")
            return
        if host == "gitlab":
            logger.info(f"GitLab repos not yet supported: {source.name}")
            return
        logger.info(f"Crawling repository: {owner}/{repo}")
        try:
            branch = await self.get_default_branch(owner, repo)
            commit_sha = await self.get_latest_commit(owner, repo, branch)
            await asyncio.sleep(RATE_LIMIT_DELAY)
            files = await self.list_files(
                owner, repo,
                branch=branch,
                patterns=source.file_patterns,
                exclude_patterns=source.exclude_patterns,
            )
            logger.info(f"Found {len(files)} matching files in {source.name}")
            for file_info in files:
                await asyncio.sleep(RATE_LIMIT_DELAY)
                # Bind file_path BEFORE the try block: previously it was
                # assigned after get_file_content, so a fetch failure on the
                # first file raised NameError inside the except handler and
                # aborted the whole crawl.
                file_path = file_info["path"]
                try:
                    content = await self.get_file_content(owner, repo, file_path, branch)
                    source_url = f"https://github.com/{owner}/{repo}/blob/{branch}/{file_path}"
                    for doc in self._documents_from_file(content, file_path, source_url, commit_sha):
                        yield doc
                except httpx.HTTPError as e:
                    logger.warning(f"Failed to fetch {file_path}: {e}")
                    continue
                except Exception as e:
                    logger.error(f"Error processing {file_path}: {e}")
                    continue
        except httpx.HTTPError as e:
            logger.error(f"HTTP error crawling {source.name}: {e}")
        except Exception as e:
            logger.error(f"Error crawling {source.name}: {e}")
class RepositoryDownloader:
    """Download a repository as a ZIP archive and extract it locally.

    Complements GitHubCrawler: instead of one API request per file, fetch
    the whole branch archive once and walk the extracted tree. Must be
    used as an async context manager so the HTTP client is closed.
    """

    def __init__(self):
        # Created in __aenter__; None means "not inside 'async with'".
        self.http_client: Optional["httpx.AsyncClient"] = None

    async def __aenter__(self):
        # Archive downloads can be large, so the timeout is longer than
        # the per-request REQUEST_TIMEOUT used by GitHubCrawler.
        self.http_client = httpx.AsyncClient(
            timeout=120.0,
            follow_redirects=True,
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.http_client:
            await self.http_client.aclose()

    async def download_zip(self, repo_url: str, branch: str = "main") -> Path:
        """Download a GitHub repository branch as ZIP and extract it.

        Returns the path of the extracted repository root inside a fresh
        temp directory; the caller is responsible for removing it via
        cleanup().

        Raises:
            RuntimeError: when used outside an 'async with' block.
            httpx.HTTPStatusError: when the archive download fails.
        """
        if not self.http_client:
            raise RuntimeError("Downloader not initialized. Use 'async with' context.")
        parsed = urlparse(repo_url)
        path_parts = parsed.path.strip('/').split('/')
        owner = path_parts[0]
        repo = path_parts[1]
        # Strip only a trailing '.git' suffix; a plain replace() would also
        # corrupt names that contain '.git' elsewhere (e.g. 'my.github.io').
        if repo.endswith('.git'):
            repo = repo[:-4]
        zip_url = f"https://github.com/{owner}/{repo}/archive/refs/heads/{branch}.zip"
        logger.info(f"Downloading ZIP from {zip_url}")
        response = await self.http_client.get(zip_url)
        response.raise_for_status()
        temp_dir = Path(tempfile.mkdtemp())
        zip_path = temp_dir / f"{repo}.zip"
        zip_path.write_bytes(response.content)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # zipfile sanitizes member paths, so extraction stays in temp_dir.
            zip_ref.extractall(temp_dir)
        # GitHub archives unpack to '<repo>-<branch>/'; fall back to a
        # plain '<repo>/' directory if that naming convention changes.
        extracted_dirs = list(temp_dir.glob(f"{repo}-*"))
        if extracted_dirs:
            return extracted_dirs[0]
        return temp_dir / repo

    async def crawl_local_directory(
        self,
        directory: Path,
        source: "SourceConfig",
        base_url: str,
    ) -> "AsyncGenerator[ExtractedDocument, None]":
        """Walk an extracted repository tree and yield parsed documents.

        Applies the source's file patterns and exclude patterns relative
        to `directory`, and stamps each document with a source URL built
        from `base_url`. Each file is yielded at most once, even when it
        matches several patterns.
        """
        patterns = source.file_patterns or ["*.md", "*.txt", "*.html"]
        exclude_patterns = source.exclude_patterns or []
        seen = set()  # rel paths already emitted; dedupes overlapping patterns
        for pattern in patterns:
            # rglob already recurses, so drop any leading '**/' prefix.
            for file_path in directory.rglob(pattern.replace("**/", "")):
                if not file_path.is_file():
                    continue
                rel_path = str(file_path.relative_to(directory))
                if rel_path in seen:
                    continue
                seen.add(rel_path)
                if any(fnmatch(rel_path, ep) for ep in exclude_patterns):
                    continue
                if file_path.stat().st_size > MAX_FILE_SIZE:
                    continue
                # Repos mix encodings: try UTF-8, fall back to latin-1
                # (which never fails on arbitrary bytes), else skip.
                try:
                    content = file_path.read_text(encoding='utf-8')
                except UnicodeDecodeError:
                    try:
                        content = file_path.read_text(encoding='latin-1')
                    except Exception:
                        continue
                source_url = f"{base_url}/{rel_path}"
                suffix = file_path.suffix
                if suffix == '.md':
                    doc = MarkdownParser.parse(content, rel_path)
                    doc.source_url = source_url
                    yield doc
                elif suffix in ('.html', '.htm'):
                    doc = HTMLParser.parse(content, rel_path)
                    doc.source_url = source_url
                    yield doc
                elif suffix == '.json':
                    # A JSON file may contain multiple documents.
                    for doc in JSONParser.parse(content, rel_path):
                        doc.source_url = source_url
                        yield doc
                elif suffix == '.txt':
                    yield ExtractedDocument(
                        text=content,
                        title=file_path.stem,
                        file_path=rel_path,
                        file_type="text",
                        source_url=source_url,
                        language=MarkdownParser._detect_language(content),
                        placeholders=MarkdownParser._find_placeholders(content),
                    )

    def cleanup(self, directory: Path):
        """Remove a temporary extraction directory; safe to call repeatedly."""
        if directory.exists():
            shutil.rmtree(directory, ignore_errors=True)
async def crawl_source(source: "SourceConfig") -> "List[ExtractedDocument]":
    """Crawl one source configuration and collect every extracted document.

    Sources without a repository URL produce an empty list.
    """
    if not source.repo_url:
        return []
    async with GitHubCrawler() as crawler:
        return [doc async for doc in crawler.crawl_repository(source)]
# CLI for testing
async def main():
    """Manual smoke test: crawl one known source and print what was found."""
    from template_sources import TEMPLATE_SOURCES

    sample = next(s for s in TEMPLATE_SOURCES if s.name == "github-site-policy")
    async with GitHubCrawler() as crawler:
        total = 0
        async for doc in crawler.crawl_repository(sample):
            total += 1
            print(f"\n{'='*60}")
            print(f"Title: {doc.title}")
            print(f"Path: {doc.file_path}")
            print(f"Type: {doc.file_type}")
            print(f"Language: {doc.language}")
            print(f"URL: {doc.source_url}")
            print(f"Placeholders: {doc.placeholders[:5] if doc.placeholders else 'None'}")
            print(f"Text preview: {doc.text[:200]}...")
        print(f"\n\nTotal documents: {total}")
if __name__ == "__main__":
    # Manual entry point: runs the smoke-test crawl defined in main().
    asyncio.run(main())