feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)

New standalone Python/FastAPI service for automatic compliance document
scanning, LLM-based classification, IPFS archival, and gap analysis.
Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier,
compliance matrix, and full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Boenisch
2026-02-13 20:35:15 +01:00
parent 0923c03756
commit 364d2c69ff
34 changed files with 1633 additions and 0 deletions

View File

View File

@@ -0,0 +1,152 @@
"""Document list, reclassify, archive endpoints."""
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, HTTPException, Header, Query
from pydantic import BaseModel
from db import get_pool
from archiver.dsms_client import archive_document
router = APIRouter(tags=["documents"])
class ClassifyUpdate(BaseModel):
    """Request body for manually overriding a document's classification."""
    classification: str  # new classification label to store on the document
class ArchiveBatch(BaseModel):
    """Request body for batch archival of documents."""
    document_ids: list[str]  # UUID strings of the documents to archive
@router.get("/documents")
async def list_documents(
    x_tenant_id: str = Header(...),
    classification: str | None = Query(None),
    extraction_status: str | None = Query(None),
    archived: bool | None = Query(None),
    limit: int = Query(100, le=500),
    offset: int = Query(0),
):
    """Return a page of the tenant's crawled documents with optional filters."""
    pool = await get_pool()
    # Build the WHERE clause incrementally; each placeholder number is just
    # the current length of the parameter list.
    filters = ["d.tenant_id = $1"]
    params: list = [uuid.UUID(x_tenant_id)]
    if classification:
        params.append(classification)
        filters.append(f"d.classification = ${len(params)}")
    if extraction_status:
        params.append(extraction_status)
        filters.append(f"d.extraction_status = ${len(params)}")
    if archived is not None:
        params.append(archived)
        filters.append(f"d.archived = ${len(params)}")
    where = " AND ".join(filters)
    limit_pos = len(params) + 1  # positions for LIMIT/OFFSET placeholders
    async with pool.acquire() as conn:
        total = await conn.fetchval(
            f"SELECT COUNT(*) FROM crawler_documents d WHERE {where}", *params
        )
        rows = await conn.fetch(
            f"""SELECT d.id, d.file_name, d.file_extension, d.file_size_bytes,
                       d.classification, d.classification_confidence,
                       d.classification_corrected, d.extraction_status,
                       d.archived, d.ipfs_cid, d.first_seen_at, d.last_seen_at,
                       d.version_count, s.name as source_name
                FROM crawler_documents d
                JOIN crawler_sources s ON d.source_id = s.id
                WHERE {where}
                ORDER BY d.created_at DESC
                LIMIT ${limit_pos} OFFSET ${limit_pos + 1}""",
            *params, limit, offset,
        )
    return {"total": total, "documents": [dict(r) for r in rows]}
@router.get("/documents/{doc_id}")
async def get_document(doc_id: str, x_tenant_id: str = Header(...)):
    """Fetch one document (all columns plus its source name) for the tenant."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """SELECT d.*, s.name as source_name
               FROM crawler_documents d
               JOIN crawler_sources s ON d.source_id = s.id
               WHERE d.id = $1 AND d.tenant_id = $2""",
            uuid.UUID(doc_id), uuid.UUID(x_tenant_id),
        )
    if row is None:
        raise HTTPException(404, "Document not found")
    doc = dict(row)
    text = doc.get("extracted_text")
    if text:
        # Offer a short preview so clients need not handle the full text.
        doc["text_preview"] = text[:500]
    return doc
@router.put("/documents/{doc_id}/classify")
async def classify_document_manually(
    doc_id: str, body: ClassifyUpdate, x_tenant_id: str = Header(...)
):
    """Manually override a document's classification and flag it as corrected."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            """UPDATE crawler_documents SET
               classification = $3, classification_corrected = true, updated_at = NOW()
               WHERE id = $1 AND tenant_id = $2""",
            uuid.UUID(doc_id), uuid.UUID(x_tenant_id), body.classification,
        )
    # asyncpg returns the command tag; "UPDATE 0" means no matching row.
    if status == "UPDATE 0":
        raise HTTPException(404, "Document not found")
    return {"status": "updated", "classification": body.classification, "corrected": True}
@router.post("/documents/{doc_id}/archive")
async def archive_single_document(doc_id: str, x_tenant_id: str = Header(...)):
    """Archive one document to IPFS via the DSMS archiver.

    Idempotent for already-archived documents. Raises 400 for malformed
    ids, 404 when the document is unknown, and 502 when archival fails.
    """
    # Validate ids up front: uuid.UUID raises ValueError on bad input, which
    # previously surfaced as an unhandled 500.
    try:
        doc_uuid = uuid.UUID(doc_id)
        tenant_uuid = uuid.UUID(x_tenant_id)
    except ValueError:
        raise HTTPException(400, "Invalid document or tenant id")
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_documents WHERE id = $1 AND tenant_id = $2",
            doc_uuid, tenant_uuid,
        )
    if not row:
        raise HTTPException(404, "Document not found")
    if row["archived"]:
        return {"status": "already_archived", "ipfs_cid": row["ipfs_cid"]}
    try:
        result = await archive_document(
            file_path=row["file_path"],
            file_name=row["file_name"],
            document_type=row["classification"] or "unknown",
            document_id=str(row["id"]),
        )
        cid = result.get("cid")
    except Exception as e:
        raise HTTPException(502, f"Archival failed: {e}")
    if not cid:
        # Never mark a document archived without a CID to point at.
        raise HTTPException(502, "Archival failed: archiver returned no CID")
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE crawler_documents SET archived = true, ipfs_cid = $2, archived_at = NOW(), updated_at = NOW() WHERE id = $1",
            doc_uuid, cid,
        )
    return {"status": "archived", "ipfs_cid": cid}
@router.post("/documents/archive-batch")
async def archive_batch(body: ArchiveBatch, x_tenant_id: str = Header(...)):
    """Archive many documents, collecting a per-document result.

    A failure for one id (missing document, archiver error, malformed id)
    is recorded in that id's result entry and does not abort the batch.
    """
    results = []
    for did in body.document_ids:
        try:
            r = await archive_single_document(did, x_tenant_id)
            results.append({"id": did, **r})
        except HTTPException as e:
            results.append({"id": did, "status": "error", "error": e.detail})
        except ValueError as e:
            # uuid.UUID raises ValueError for malformed ids; without this
            # handler a single bad id aborted the whole batch with a 500.
            results.append({"id": did, "status": "error", "error": str(e)})
    return {"results": results}

View File

@@ -0,0 +1,249 @@
"""Crawl job management + trigger endpoints."""
import asyncio
import json
import os
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, HTTPException, Header, BackgroundTasks
from pydantic import BaseModel
from db import get_pool
from config import settings
from crawlers.filesystem_crawler import FilesystemCrawler
from extractors.dispatcher import extract_text
from classifiers.llm_classifier import classify_document
router = APIRouter(tags=["jobs"])
class JobCreate(BaseModel):
    """Request body for creating a crawl job."""
    source_id: str  # UUID of the crawler_sources row to crawl
    job_type: str = "full"  # "full" re-processes everything; "delta" skips unchanged hashes
async def _run_crawl_job(job_id: str, source_id: str, tenant_id: str, job_type: str):
    """Background task that executes a crawl job.

    Loads the source, walks its filesystem path, extracts text from and
    classifies every found file, and upserts crawler_documents rows while
    keeping per-file statistics on the crawler_jobs row. For "delta" jobs,
    files whose content hash is unchanged are skipped entirely.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        source = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1", uuid.UUID(source_id)
        )
        if not source:
            # Source was deleted between job creation and execution.
            await conn.execute(
                "UPDATE crawler_jobs SET status = 'failed', error_message = 'Source not found', completed_at = NOW() WHERE id = $1",
                uuid.UUID(job_id),
            )
            return
        # Mark job as running
        await conn.execute(
            "UPDATE crawler_jobs SET status = 'running', started_at = NOW() WHERE id = $1",
            uuid.UUID(job_id),
        )
    # Resolve path: relative source paths are anchored at the crawl base dir.
    source_path = source["path"]
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)
    # JSON-typed columns may come back as raw strings depending on codec setup.
    exts = json.loads(source["file_extensions"]) if isinstance(source["file_extensions"], str) else source["file_extensions"]
    excludes = json.loads(source["exclude_patterns"]) if isinstance(source["exclude_patterns"], str) else source["exclude_patterns"]
    crawler = FilesystemCrawler(
        base_path=source_path,
        file_extensions=exts,
        max_depth=source["max_depth"],
        exclude_patterns=excludes,
    )
    files = crawler.crawl()
    # Running totals written back to the crawler_jobs row.
    stats = {
        "files_found": len(files),
        "files_processed": 0,
        "files_new": 0,
        "files_changed": 0,
        "files_skipped": 0,
        "files_error": 0,
    }
    for crawled in files:
        try:
            async with pool.acquire() as conn:
                # Check for existing document (delta detection)
                existing = await conn.fetchrow(
                    "SELECT id, file_hash FROM crawler_documents WHERE tenant_id = $1 AND source_id = $2 AND file_path = $3",
                    uuid.UUID(tenant_id), uuid.UUID(source_id), crawled.file_path,
                )
                if existing:
                    if job_type == "delta" and existing["file_hash"] == crawled.file_hash:
                        # Unchanged — skip, but record that the file is still present.
                        await conn.execute(
                            "UPDATE crawler_documents SET last_seen_at = NOW() WHERE id = $1",
                            existing["id"],
                        )
                        stats["files_skipped"] += 1
                        stats["files_processed"] += 1
                        continue
                    elif existing["file_hash"] != crawled.file_hash:
                        stats["files_changed"] += 1
                    else:
                        # Full job with an unchanged hash: counted as skipped.
                        # NOTE(review): unlike the delta branch above, this path
                        # does not refresh last_seen_at — confirm if intentional.
                        stats["files_skipped"] += 1
                        stats["files_processed"] += 1
                        continue
                else:
                    stats["files_new"] += 1
                # Extract text; extraction failure is non-fatal and recorded
                # on the row via extraction_status.
                extraction_status = "completed"
                extracted_text = ""
                try:
                    extracted_text = extract_text(crawled.file_path, crawled.file_extension)
                except Exception:
                    extraction_status = "failed"
                # Classify — only attempted when extraction yielded some text.
                classification_result = {"classification": None, "confidence": None, "reasoning": None}
                if extracted_text:
                    classification_result = await classify_document(
                        extracted_text, crawled.file_name, tenant_id
                    )
                if existing:
                    # Update existing (changed content): bump version_count and
                    # drop any earlier manual correction.
                    await conn.execute(
                        """UPDATE crawler_documents SET
                           job_id = $1, file_size_bytes = $2, file_hash = $3,
                           extracted_text = $4, extraction_status = $5,
                           classification = $6, classification_confidence = $7,
                           classification_reasoning = $8, classification_corrected = false,
                           last_seen_at = NOW(), version_count = version_count + 1,
                           updated_at = NOW()
                           WHERE id = $9""",
                        uuid.UUID(job_id), crawled.file_size_bytes, crawled.file_hash,
                        extracted_text, extraction_status,
                        classification_result["classification"],
                        classification_result["confidence"],
                        classification_result["reasoning"],
                        existing["id"],
                    )
                else:
                    # Insert new
                    await conn.execute(
                        """INSERT INTO crawler_documents
                           (tenant_id, source_id, job_id, file_path, file_name, file_extension,
                           file_size_bytes, file_hash, extracted_text, extraction_status,
                           classification, classification_confidence, classification_reasoning)
                           VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13)""",
                        uuid.UUID(tenant_id), uuid.UUID(source_id), uuid.UUID(job_id),
                        crawled.file_path, crawled.file_name, crawled.file_extension,
                        crawled.file_size_bytes, crawled.file_hash,
                        extracted_text, extraction_status,
                        classification_result["classification"],
                        classification_result["confidence"],
                        classification_result["reasoning"],
                    )
                stats["files_processed"] += 1
        except Exception:
            # A per-file failure must not kill the whole job.
            stats["files_error"] += 1
            stats["files_processed"] += 1
        # Update job progress so clients polling the job see live counters.
        async with pool.acquire() as conn:
            await conn.execute(
                """UPDATE crawler_jobs SET
                   files_found=$2, files_processed=$3, files_new=$4,
                   files_changed=$5, files_skipped=$6, files_error=$7
                   WHERE id = $1""",
                uuid.UUID(job_id),
                stats["files_found"], stats["files_processed"],
                stats["files_new"], stats["files_changed"],
                stats["files_skipped"], stats["files_error"],
            )
    # Mark completed
    async with pool.acquire() as conn:
        await conn.execute(
            "UPDATE crawler_jobs SET status = 'completed', completed_at = NOW() WHERE id = $1",
            uuid.UUID(job_id),
        )
@router.post("/jobs", status_code=201)
async def create_job(
    body: JobCreate,
    background_tasks: BackgroundTasks,
    x_tenant_id: str = Header(...),
):
    """Create a crawl job for a source and launch it as a background task."""
    pool = await get_pool()
    source_uuid = uuid.UUID(body.source_id)
    tenant_uuid = uuid.UUID(x_tenant_id)
    async with pool.acquire() as conn:
        # The source must exist and belong to this tenant.
        source = await conn.fetchrow(
            "SELECT id FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            source_uuid, tenant_uuid,
        )
        if source is None:
            raise HTTPException(404, "Source not found")
        # Refuse to start while another job is running on the same source.
        # NOTE(review): this check-then-insert is not race-free under
        # concurrent requests — confirm whether a DB constraint backs it.
        already_running = await conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM crawler_jobs WHERE source_id = $1 AND status = 'running')",
            source_uuid,
        )
        if already_running:
            raise HTTPException(409, "A job is already running for this source")
        row = await conn.fetchrow(
            """INSERT INTO crawler_jobs (tenant_id, source_id, job_type)
               VALUES ($1, $2, $3) RETURNING *""",
            tenant_uuid, source_uuid, body.job_type,
        )
    background_tasks.add_task(
        _run_crawl_job, str(row["id"]), body.source_id, x_tenant_id, body.job_type
    )
    return dict(row)
@router.get("/jobs")
async def list_jobs(x_tenant_id: str = Header(...)):
    """Return the tenant's 50 most recent crawl jobs, newest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            """SELECT j.*, s.name as source_name
               FROM crawler_jobs j JOIN crawler_sources s ON j.source_id = s.id
               WHERE j.tenant_id = $1 ORDER BY j.created_at DESC LIMIT 50""",
            uuid.UUID(x_tenant_id),
        )
    return [dict(record) for record in records]
@router.get("/jobs/{job_id}")
async def get_job(job_id: str, x_tenant_id: str = Header(...)):
    """Fetch a single crawl job belonging to the tenant."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        record = await conn.fetchrow(
            "SELECT * FROM crawler_jobs WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(job_id), uuid.UUID(x_tenant_id),
        )
    if record is None:
        raise HTTPException(404, "Job not found")
    return dict(record)
@router.post("/jobs/{job_id}/cancel")
async def cancel_job(job_id: str, x_tenant_id: str = Header(...)):
    """Cancel a pending or running crawl job; 404 if absent or already done."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "UPDATE crawler_jobs SET status = 'cancelled', completed_at = NOW() WHERE id = $1 AND tenant_id = $2 AND status IN ('pending', 'running')",
            uuid.UUID(job_id), uuid.UUID(x_tenant_id),
        )
    # asyncpg command tag "UPDATE 0" means nothing matched the conditions.
    if status == "UPDATE 0":
        raise HTTPException(404, "Job not found or not cancellable")
    return {"status": "cancelled"}

View File

@@ -0,0 +1,99 @@
"""Onboarding report + gap analysis endpoints."""
import json
import uuid
from fastapi import APIRouter, HTTPException, Header
from pydantic import BaseModel
from db import get_pool
from gap_analysis.analyzer import generate_gap_analysis
router = APIRouter(tags=["reports"])
class ReportGenerate(BaseModel):
    """Request body for generating an onboarding / gap-analysis report."""
    job_id: str | None = None  # optionally tie the report to a crawl job
    # Company profiles fed into the gap analysis; pydantic copies mutable
    # defaults per instance, so the shared list literal is safe here.
    company_profiles: list[str] = ["universal", "data_processor", "ai_user"]
@router.post("/reports/generate", status_code=201)
async def generate_report(body: ReportGenerate, x_tenant_id: str = Header(...)):
    """Run gap analysis over the tenant's classified documents and persist a report."""
    pool = await get_pool()
    tid = uuid.UUID(x_tenant_id)
    async with pool.acquire() as conn:
        # Per-classification document counts for this tenant.
        count_rows = await conn.fetch(
            """SELECT classification, COUNT(*) as cnt
               FROM crawler_documents
               WHERE tenant_id = $1 AND classification IS NOT NULL
               GROUP BY classification""",
            tid,
        )
        classification_counts = {r["classification"]: r["cnt"] for r in count_rows}
        total_docs = await conn.fetchval(
            "SELECT COUNT(*) FROM crawler_documents WHERE tenant_id = $1", tid
        )
    # Run gap analysis against the requested company profiles.
    analysis = generate_gap_analysis(classification_counts, body.company_profiles)
    jid = uuid.UUID(body.job_id) if body.job_id else None
    # Persist the report; list/dict payloads are stored as JSON text.
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_onboarding_reports
               (tenant_id, job_id, total_documents_found, classification_breakdown, gaps, compliance_score)
               VALUES ($1, $2, $3, $4, $5, $6)
               RETURNING *""",
            tid, jid, total_docs,
            json.dumps(classification_counts),
            json.dumps(analysis["gaps"]),
            analysis["compliance_score"],
        )
    report = dict(row)
    report["gap_summary"] = analysis["gap_summary"]
    report["covered"] = analysis["covered"]
    report["total_required"] = analysis["total_required"]
    return report
@router.get("/reports")
async def list_reports(x_tenant_id: str = Header(...)):
    """Return the tenant's 20 most recent onboarding reports, newest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            "SELECT * FROM crawler_onboarding_reports WHERE tenant_id = $1 ORDER BY created_at DESC LIMIT 20",
            uuid.UUID(x_tenant_id),
        )
    return [dict(record) for record in records]
@router.get("/reports/{report_id}")
async def get_report(report_id: str, x_tenant_id: str = Header(...)):
    """Fetch a stored report, decoding JSON columns and adding a severity summary."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_onboarding_reports WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(report_id), uuid.UUID(x_tenant_id),
        )
    if row is None:
        raise HTTPException(404, "Report not found")
    report = dict(row)
    # JSON columns may come back as raw strings; decode them for the client.
    for key in ("gaps", "classification_breakdown"):
        if isinstance(report.get(key), str):
            report[key] = json.loads(report[key])
    # Computed summary: gap counts per severity level.
    gaps = report.get("gaps", [])
    report["gap_summary"] = {
        label: sum(1 for g in gaps if g.get("severity") == sev)
        for label, sev in (
            ("critical", "CRITICAL"),
            ("high", "HIGH"),
            ("medium", "MEDIUM"),
        )
    }
    return report

View File

@@ -0,0 +1,148 @@
"""Crawl source CRUD endpoints."""
import json
import os
import uuid
from fastapi import APIRouter, HTTPException, Header
from pydantic import BaseModel
from db import get_pool
from config import settings
router = APIRouter(tags=["sources"])
class SourceCreate(BaseModel):
    """Request body for registering a new crawl source."""
    name: str
    source_type: str = "local"  # defaults to "local" filesystem source
    path: str  # absolute, or resolved relative to settings.CRAWL_BASE_PATH
    # Mutable defaults are safe on pydantic models: pydantic copies them
    # per instance instead of sharing one list across requests.
    file_extensions: list[str] = [".pdf", ".docx", ".xlsx", ".pptx"]
    max_depth: int = 5  # maximum directory recursion depth for the crawler
    exclude_patterns: list[str] = []  # path patterns the crawler skips
    enabled: bool = True
class SourceUpdate(BaseModel):
    """Partial-update body for a crawl source; None means "leave unchanged"."""
    name: str | None = None
    path: str | None = None
    file_extensions: list[str] | None = None
    max_depth: int | None = None
    exclude_patterns: list[str] | None = None
    enabled: bool | None = None
@router.get("/sources")
async def list_sources(x_tenant_id: str = Header(...)):
    """Return all crawl sources configured for the tenant, newest first."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        records = await conn.fetch(
            "SELECT * FROM crawler_sources WHERE tenant_id = $1 ORDER BY created_at DESC",
            uuid.UUID(x_tenant_id),
        )
    return [dict(record) for record in records]
@router.post("/sources", status_code=201)
async def create_source(body: SourceCreate, x_tenant_id: str = Header(...)):
    """Register a new crawl source for the tenant and return the stored row."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """INSERT INTO crawler_sources
               (tenant_id, name, source_type, path, file_extensions, max_depth, exclude_patterns, enabled)
               VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
               RETURNING *""",
            uuid.UUID(x_tenant_id),
            body.name,
            body.source_type,
            body.path,
            json.dumps(body.file_extensions),   # list columns stored as JSON text
            body.max_depth,
            json.dumps(body.exclude_patterns),
            body.enabled,
        )
    return dict(row)
@router.put("/sources/{source_id}")
async def update_source(source_id: str, body: SourceUpdate, x_tenant_id: str = Header(...)):
    """Partially update a crawl source; only fields present in the body change."""
    pool = await get_pool()
    sid = uuid.UUID(source_id)
    tid = uuid.UUID(x_tenant_id)
    async with pool.acquire() as conn:
        current = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            sid, tid,
        )
        if current is None:
            raise HTTPException(404, "Source not found")
        # Collect only the fields the caller provided; list-valued columns
        # are stored as JSON text. Column names are a fixed whitelist, so
        # interpolating them into SQL below is safe.
        changes: dict = {}
        if body.name is not None:
            changes["name"] = body.name
        if body.path is not None:
            changes["path"] = body.path
        if body.file_extensions is not None:
            changes["file_extensions"] = json.dumps(body.file_extensions)
        if body.max_depth is not None:
            changes["max_depth"] = json.dumps(body.max_depth) if False else body.max_depth
        if body.exclude_patterns is not None:
            changes["exclude_patterns"] = json.dumps(body.exclude_patterns)
        if body.enabled is not None:
            changes["enabled"] = body.enabled
        if not changes:
            # Nothing to change — return the row as-is.
            return dict(current)
        # Placeholders $1/$2 are id/tenant; field values start at $3.
        set_clause = ", ".join(
            f"{column} = ${pos}" for pos, column in enumerate(changes, start=3)
        )
        row = await conn.fetchrow(
            f"UPDATE crawler_sources SET {set_clause}, updated_at = NOW() WHERE id = $1 AND tenant_id = $2 RETURNING *",
            sid, tid, *changes.values(),
        )
    return dict(row)
@router.delete("/sources/{source_id}", status_code=204)
async def delete_source(source_id: str, x_tenant_id: str = Header(...)):
    """Delete a crawl source belonging to the tenant; 404 when absent."""
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(
            "DELETE FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(source_id), uuid.UUID(x_tenant_id),
        )
    # asyncpg command tag "DELETE 0" means no row matched.
    if status == "DELETE 0":
        raise HTTPException(404, "Source not found")
@router.post("/sources/{source_id}/test")
async def test_source(source_id: str, x_tenant_id: str = Header(...)):
    """Test connectivity to a crawl source.

    Resolves the source path inside the container, reports whether it is a
    reachable directory, and counts matching files in the top-level
    directory only (a quick smoke test, not a full crawl).
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            "SELECT * FROM crawler_sources WHERE id = $1 AND tenant_id = $2",
            uuid.UUID(source_id), uuid.UUID(x_tenant_id),
        )
    if not row:
        raise HTTPException(404, "Source not found")
    # For local sources, check if the path exists inside the container;
    # relative paths are resolved against CRAWL_BASE_PATH.
    source_path = row["path"]
    if not os.path.isabs(source_path):
        source_path = os.path.join(settings.CRAWL_BASE_PATH, source_path)
    exists = os.path.isdir(source_path)
    file_count = 0
    if exists:
        raw_exts = row["file_extensions"]
        exts = json.loads(raw_exts) if isinstance(raw_exts, str) else raw_exts
        # Normalize to a lowercase set: the previous list comparison meant
        # configured extensions like ".PDF" never matched (and membership
        # tests were O(n)).
        allowed = {e.lower() for e in exts}
        for root, dirs, files in os.walk(source_path):
            for f in files:
                _, ext = os.path.splitext(f)
                if ext.lower() in allowed:
                    file_count += 1
            break  # only top-level for test
    return {
        "reachable": exists,
        "path_resolved": source_path,
        "sample_file_count": file_count,
        "message": "Pfad erreichbar" if exists else "Pfad nicht gefunden",
    }