backend-lehrer (11 files): - llm_gateway/routes/schools.py (867 → 5), recording_api.py (848 → 6) - messenger_api.py (840 → 5), print_generator.py (824 → 5) - unit_analytics_api.py (751 → 5), classroom/routes/context.py (726 → 4) - llm_gateway/routes/edu_search_seeds.py (710 → 4) klausur-service (12 files): - ocr_labeling_api.py (845 → 4), metrics_db.py (833 → 4) - legal_corpus_api.py (790 → 4), page_crop.py (758 → 3) - mail/ai_service.py (747 → 4), github_crawler.py (767 → 3) - trocr_service.py (730 → 4), full_compliance_pipeline.py (723 → 4) - dsfa_rag_api.py (715 → 4), ocr_pipeline_auto.py (705 → 4) website (6 pages): - audit-checklist (867 → 8), content (806 → 6) - screen-flow (790 → 4), scraper (789 → 5) - zeugnisse (776 → 5), modules (745 → 4) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
412 lines
14 KiB
Python
"""
|
|
GitHub Crawler - Core Crawler and Downloader
|
|
|
|
GitHubCrawler for API-based repository crawling and RepositoryDownloader
|
|
for ZIP-based local extraction.
|
|
|
|
Extracted from github_crawler.py to keep files under 500 LOC.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import zipfile
|
|
from fnmatch import fnmatch
|
|
from pathlib import Path
|
|
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
from template_sources import SourceConfig
|
|
from github_crawler_parsers import (
|
|
ExtractedDocument,
|
|
MarkdownParser,
|
|
HTMLParser,
|
|
JSONParser,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration
|
|
GITHUB_API_URL = "https://api.github.com"
|
|
GITLAB_API_URL = "https://gitlab.com/api/v4"
|
|
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
|
|
MAX_FILE_SIZE = 1024 * 1024 # 1 MB max file size
|
|
REQUEST_TIMEOUT = 60.0
|
|
RATE_LIMIT_DELAY = 1.0
|
|
|
|
|
|
class GitHubCrawler:
|
|
"""Crawl GitHub repositories for legal templates."""
|
|
|
|
def __init__(self, token: Optional[str] = None):
|
|
self.token = token or GITHUB_TOKEN
|
|
self.headers = {
|
|
"Accept": "application/vnd.github.v3+json",
|
|
"User-Agent": "LegalTemplatesCrawler/1.0",
|
|
}
|
|
if self.token:
|
|
self.headers["Authorization"] = f"token {self.token}"
|
|
|
|
self.http_client: Optional[httpx.AsyncClient] = None
|
|
|
|
async def __aenter__(self):
|
|
self.http_client = httpx.AsyncClient(
|
|
timeout=REQUEST_TIMEOUT,
|
|
headers=self.headers,
|
|
follow_redirects=True,
|
|
)
|
|
return self
|
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
if self.http_client:
|
|
await self.http_client.aclose()
|
|
|
|
def _parse_repo_url(self, url: str) -> Tuple[str, str, str]:
|
|
"""Parse repository URL into owner, repo, and host."""
|
|
parsed = urlparse(url)
|
|
path_parts = parsed.path.strip('/').split('/')
|
|
|
|
if len(path_parts) < 2:
|
|
raise ValueError(f"Invalid repository URL: {url}")
|
|
|
|
owner = path_parts[0]
|
|
repo = path_parts[1].replace('.git', '')
|
|
|
|
if 'gitlab' in parsed.netloc:
|
|
host = 'gitlab'
|
|
else:
|
|
host = 'github'
|
|
|
|
return owner, repo, host
|
|
|
|
async def get_default_branch(self, owner: str, repo: str) -> str:
|
|
"""Get the default branch of a repository."""
|
|
if not self.http_client:
|
|
raise RuntimeError("Crawler not initialized. Use 'async with' context.")
|
|
|
|
url = f"{GITHUB_API_URL}/repos/{owner}/{repo}"
|
|
response = await self.http_client.get(url)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
return data.get("default_branch", "main")
|
|
|
|
async def get_latest_commit(self, owner: str, repo: str, branch: str = "main") -> str:
|
|
"""Get the latest commit SHA for a branch."""
|
|
if not self.http_client:
|
|
raise RuntimeError("Crawler not initialized. Use 'async with' context.")
|
|
|
|
url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/commits/{branch}"
|
|
response = await self.http_client.get(url)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
return data.get("sha", "")
|
|
|
|
async def list_files(
|
|
self,
|
|
owner: str,
|
|
repo: str,
|
|
path: str = "",
|
|
branch: str = "main",
|
|
patterns: List[str] = None,
|
|
exclude_patterns: List[str] = None,
|
|
) -> List[Dict[str, Any]]:
|
|
"""List files in a repository matching the given patterns."""
|
|
if not self.http_client:
|
|
raise RuntimeError("Crawler not initialized. Use 'async with' context.")
|
|
|
|
patterns = patterns or ["*.md", "*.txt", "*.html"]
|
|
exclude_patterns = exclude_patterns or []
|
|
|
|
url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
|
|
response = await self.http_client.get(url)
|
|
response.raise_for_status()
|
|
data = response.json()
|
|
|
|
files = []
|
|
for item in data.get("tree", []):
|
|
if item["type"] != "blob":
|
|
continue
|
|
|
|
file_path = item["path"]
|
|
|
|
excluded = any(fnmatch(file_path, pattern) for pattern in exclude_patterns)
|
|
if excluded:
|
|
continue
|
|
|
|
matched = any(fnmatch(file_path, pattern) for pattern in patterns)
|
|
if not matched:
|
|
continue
|
|
|
|
if item.get("size", 0) > MAX_FILE_SIZE:
|
|
logger.warning(f"Skipping large file: {file_path} ({item['size']} bytes)")
|
|
continue
|
|
|
|
files.append({
|
|
"path": file_path,
|
|
"sha": item["sha"],
|
|
"size": item.get("size", 0),
|
|
"url": item.get("url", ""),
|
|
})
|
|
|
|
return files
|
|
|
|
async def get_file_content(self, owner: str, repo: str, path: str, branch: str = "main") -> str:
|
|
"""Get the content of a file from a repository."""
|
|
if not self.http_client:
|
|
raise RuntimeError("Crawler not initialized. Use 'async with' context.")
|
|
|
|
url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
|
|
response = await self.http_client.get(url)
|
|
response.raise_for_status()
|
|
return response.text
|
|
|
|
async def crawl_repository(
|
|
self,
|
|
source: SourceConfig,
|
|
) -> AsyncGenerator[ExtractedDocument, None]:
|
|
"""Crawl a repository and yield extracted documents."""
|
|
if not source.repo_url:
|
|
logger.warning(f"No repo URL for source: {source.name}")
|
|
return
|
|
|
|
try:
|
|
owner, repo, host = self._parse_repo_url(source.repo_url)
|
|
except ValueError as e:
|
|
logger.error(f"Failed to parse repo URL for {source.name}: {e}")
|
|
return
|
|
|
|
if host == "gitlab":
|
|
logger.info(f"GitLab repos not yet supported: {source.name}")
|
|
return
|
|
|
|
logger.info(f"Crawling repository: {owner}/{repo}")
|
|
|
|
try:
|
|
branch = await self.get_default_branch(owner, repo)
|
|
commit_sha = await self.get_latest_commit(owner, repo, branch)
|
|
|
|
await asyncio.sleep(RATE_LIMIT_DELAY)
|
|
|
|
files = await self.list_files(
|
|
owner, repo,
|
|
branch=branch,
|
|
patterns=source.file_patterns,
|
|
exclude_patterns=source.exclude_patterns,
|
|
)
|
|
|
|
logger.info(f"Found {len(files)} matching files in {source.name}")
|
|
|
|
for file_info in files:
|
|
await asyncio.sleep(RATE_LIMIT_DELAY)
|
|
|
|
try:
|
|
content = await self.get_file_content(
|
|
owner, repo, file_info["path"], branch
|
|
)
|
|
|
|
file_path = file_info["path"]
|
|
source_url = f"https://github.com/{owner}/{repo}/blob/{branch}/{file_path}"
|
|
|
|
if file_path.endswith('.md'):
|
|
doc = MarkdownParser.parse(content, file_path)
|
|
doc.source_url = source_url
|
|
doc.source_commit = commit_sha
|
|
yield doc
|
|
|
|
elif file_path.endswith('.html') or file_path.endswith('.htm'):
|
|
doc = HTMLParser.parse(content, file_path)
|
|
doc.source_url = source_url
|
|
doc.source_commit = commit_sha
|
|
yield doc
|
|
|
|
elif file_path.endswith('.json'):
|
|
docs = JSONParser.parse(content, file_path)
|
|
for doc in docs:
|
|
doc.source_url = source_url
|
|
doc.source_commit = commit_sha
|
|
yield doc
|
|
|
|
elif file_path.endswith('.txt'):
|
|
yield ExtractedDocument(
|
|
text=content,
|
|
title=Path(file_path).stem,
|
|
file_path=file_path,
|
|
file_type="text",
|
|
source_url=source_url,
|
|
source_commit=commit_sha,
|
|
language=MarkdownParser._detect_language(content),
|
|
placeholders=MarkdownParser._find_placeholders(content),
|
|
)
|
|
|
|
except httpx.HTTPError as e:
|
|
logger.warning(f"Failed to fetch {file_path}: {e}")
|
|
continue
|
|
except Exception as e:
|
|
logger.error(f"Error processing {file_path}: {e}")
|
|
continue
|
|
|
|
except httpx.HTTPError as e:
|
|
logger.error(f"HTTP error crawling {source.name}: {e}")
|
|
except Exception as e:
|
|
logger.error(f"Error crawling {source.name}: {e}")
|
|
|
|
|
|
class RepositoryDownloader:
|
|
"""Download and extract repository archives."""
|
|
|
|
def __init__(self):
|
|
self.http_client: Optional[httpx.AsyncClient] = None
|
|
|
|
async def __aenter__(self):
|
|
self.http_client = httpx.AsyncClient(
|
|
timeout=120.0,
|
|
follow_redirects=True,
|
|
)
|
|
return self
|
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
if self.http_client:
|
|
await self.http_client.aclose()
|
|
|
|
async def download_zip(self, repo_url: str, branch: str = "main") -> Path:
|
|
"""Download repository as ZIP and extract to temp directory."""
|
|
if not self.http_client:
|
|
raise RuntimeError("Downloader not initialized. Use 'async with' context.")
|
|
|
|
parsed = urlparse(repo_url)
|
|
path_parts = parsed.path.strip('/').split('/')
|
|
owner = path_parts[0]
|
|
repo = path_parts[1].replace('.git', '')
|
|
|
|
zip_url = f"https://github.com/{owner}/{repo}/archive/refs/heads/{branch}.zip"
|
|
|
|
logger.info(f"Downloading ZIP from {zip_url}")
|
|
|
|
response = await self.http_client.get(zip_url)
|
|
response.raise_for_status()
|
|
|
|
temp_dir = Path(tempfile.mkdtemp())
|
|
zip_path = temp_dir / f"{repo}.zip"
|
|
|
|
with open(zip_path, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
extract_dir = temp_dir / repo
|
|
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
|
zip_ref.extractall(temp_dir)
|
|
|
|
extracted_dirs = list(temp_dir.glob(f"{repo}-*"))
|
|
if extracted_dirs:
|
|
return extracted_dirs[0]
|
|
|
|
return extract_dir
|
|
|
|
async def crawl_local_directory(
|
|
self,
|
|
directory: Path,
|
|
source: SourceConfig,
|
|
base_url: str,
|
|
) -> AsyncGenerator[ExtractedDocument, None]:
|
|
"""Crawl a local directory for documents."""
|
|
patterns = source.file_patterns or ["*.md", "*.txt", "*.html"]
|
|
exclude_patterns = source.exclude_patterns or []
|
|
|
|
for pattern in patterns:
|
|
for file_path in directory.rglob(pattern.replace("**/", "")):
|
|
if not file_path.is_file():
|
|
continue
|
|
|
|
rel_path = str(file_path.relative_to(directory))
|
|
|
|
excluded = any(fnmatch(rel_path, ep) for ep in exclude_patterns)
|
|
if excluded:
|
|
continue
|
|
|
|
if file_path.stat().st_size > MAX_FILE_SIZE:
|
|
continue
|
|
|
|
try:
|
|
content = file_path.read_text(encoding='utf-8')
|
|
except UnicodeDecodeError:
|
|
try:
|
|
content = file_path.read_text(encoding='latin-1')
|
|
except Exception:
|
|
continue
|
|
|
|
source_url = f"{base_url}/{rel_path}"
|
|
|
|
if file_path.suffix == '.md':
|
|
doc = MarkdownParser.parse(content, rel_path)
|
|
doc.source_url = source_url
|
|
yield doc
|
|
|
|
elif file_path.suffix in ['.html', '.htm']:
|
|
doc = HTMLParser.parse(content, rel_path)
|
|
doc.source_url = source_url
|
|
yield doc
|
|
|
|
elif file_path.suffix == '.json':
|
|
docs = JSONParser.parse(content, rel_path)
|
|
for doc in docs:
|
|
doc.source_url = source_url
|
|
yield doc
|
|
|
|
elif file_path.suffix == '.txt':
|
|
yield ExtractedDocument(
|
|
text=content,
|
|
title=file_path.stem,
|
|
file_path=rel_path,
|
|
file_type="text",
|
|
source_url=source_url,
|
|
language=MarkdownParser._detect_language(content),
|
|
placeholders=MarkdownParser._find_placeholders(content),
|
|
)
|
|
|
|
def cleanup(self, directory: Path):
|
|
"""Clean up temporary directory."""
|
|
if directory.exists():
|
|
shutil.rmtree(directory, ignore_errors=True)
|
|
|
|
|
|
async def crawl_source(source: SourceConfig) -> List[ExtractedDocument]:
|
|
"""Crawl a source configuration and return all extracted documents."""
|
|
documents = []
|
|
|
|
if source.repo_url:
|
|
async with GitHubCrawler() as crawler:
|
|
async for doc in crawler.crawl_repository(source):
|
|
documents.append(doc)
|
|
|
|
return documents
|
|
|
|
|
|
# CLI for testing
|
|
async def main():
|
|
"""Test crawler with a sample source."""
|
|
from template_sources import TEMPLATE_SOURCES
|
|
|
|
source = next(s for s in TEMPLATE_SOURCES if s.name == "github-site-policy")
|
|
|
|
async with GitHubCrawler() as crawler:
|
|
count = 0
|
|
async for doc in crawler.crawl_repository(source):
|
|
count += 1
|
|
print(f"\n{'='*60}")
|
|
print(f"Title: {doc.title}")
|
|
print(f"Path: {doc.file_path}")
|
|
print(f"Type: {doc.file_type}")
|
|
print(f"Language: {doc.language}")
|
|
print(f"URL: {doc.source_url}")
|
|
print(f"Placeholders: {doc.placeholders[:5] if doc.placeholders else 'None'}")
|
|
print(f"Text preview: {doc.text[:200]}...")
|
|
|
|
print(f"\n\nTotal documents: {count}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|