Files
breakpilot-compliance/dsms-gateway/routers/documents.py
T
Benjamin Admin 216c7b8eca
CI / detect-changes (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m21s
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 23s
CI / test-python-backend (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Successful in 17s
feat(iace): DSMS-CID-Badge im Tech-File-Export + aggregierter Bulk-Diff
Punkt 1 — UI-CID-Badge nach erfolgreichem Tech-File-Export:
- archiveTechFile setzt X-DSMS-CID / X-DSMS-Filename / X-DSMS-Size response
  headers + Access-Control-Expose-Headers, sobald DSMS-Archive durchlief
- Split iace_handler_techfile.go (war ueber 500 LOC) → archiveTechFile lebt
  jetzt in iace_handler_techfile_archive.go, setDSMSResponseHeaders als
  pure Helper mit 3 unit tests
- Next.js IACE-Proxy forwarded die X-DSMS-* Header und erkennt jetzt auch
  XLSX/DOCX/MD als Binary-Response (vorher nur PDF/ZIP/octet-stream)
- ExportCIDBadge.tsx zeigt CID, Filename, Groesse + Kopieren-Button +
  "Verlauf anzeigen" (oeffnet CIDHistoryModal)

Punkt 2 — Bulk-Diff Report V1 → V_latest:
- Neuer Endpoint GET /api/v1/documents/{cid}/bulk-diff im dsms-gateway:
  laeuft parent_cid-Kette ab, berechnet chronologische Step-Diffs,
  aggregiert Totals (added/removed lines, metadata_fields_changed,
  binary_steps). Edge-Cases: einzelne Version, binaere Steps, abgebrochene
  Kette
- BulkDiffPanel.tsx zeigt 4-Stat-Header + Step-Tabelle
- CIDHistoryModal bekommt Toggle-Button "Bulk-Diff V1 → V_latest anzeigen"
  neben dem Versions-Counter; damit auch vom IACE-Export-Badge erreichbar

Tests: 3 neue Go-Tests, 4 neue pytest-Tests, alle gruen

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-09 09:07:20 +02:00

487 lines
15 KiB
Python

"""
Documents router — handles /api/v1/documents and /api/v1/legal-documents endpoints.
"""
import hashlib
import json
import io
from datetime import datetime
from typing import Optional
import httpx
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from models import DocumentList, DocumentMetadata, StoredDocument
from dependencies import verify_token, ipfs_add, ipfs_cat, ipfs_pin_ls
from config import IPFS_API_URL, IPFS_GATEWAY_URL
# Router instance on which every endpoint in this module registers itself
# through the @router.<method> decorators.
router = APIRouter()
@router.post("/api/v1/documents", response_model=StoredDocument)
async def store_document(
    file: UploadFile = File(...),
    document_type: str = "legal_document",
    document_id: Optional[str] = None,
    version: Optional[str] = None,
    language: str = "de",
    _auth: dict = Depends(verify_token)
):
    """
    Store an uploaded document in the DSMS.

    - **file**: The document to store
    - **document_type**: Type of the document (legal_document, consent_record, audit_log)
    - **document_id**: Optional ID of the document
    - **version**: Optional version number
    - **language**: Language (default: de)

    Returns a StoredDocument carrying the CID and size reported by the IPFS
    add call, the generated metadata and a gateway URL for the CID.
    """
    payload = await file.read()

    # The SHA-256 digest of the raw upload is recorded inside the metadata.
    metadata = DocumentMetadata(
        document_type=document_type,
        document_id=document_id,
        version=version,
        language=language,
        created_at=datetime.utcnow().isoformat(),
        checksum=hashlib.sha256(payload).hexdigest(),
        encrypted=False,
    )

    # Metadata, payload and filename travel together in one JSON envelope.
    # NOTE: despite the key name, the payload is hex-encoded, not base64.
    envelope = json.dumps({
        "metadata": metadata.model_dump(),
        "content_base64": payload.hex(),
        "filename": file.filename,
    }).encode()

    # Hand the envelope to IPFS; the CID and size come from its answer.
    add_result = await ipfs_add(envelope)
    cid = add_result.get("Hash")

    return StoredDocument(
        cid=cid,
        size=int(add_result.get("Size", 0)),
        metadata=metadata,
        gateway_url=f"{IPFS_GATEWAY_URL}/ipfs/{cid}",
        timestamp=datetime.utcnow().isoformat(),
    )
@router.get("/api/v1/documents/{cid}")
async def get_document(
    cid: str,
    _auth: dict = Depends(verify_token)
):
    """
    Fetch a document from the DSMS and stream it back as a download.

    - **cid**: Content Identifier (IPFS hash)

    A DSMS package is unwrapped: its payload is decoded and served with a
    Content-Disposition header plus X-DSMS-* metadata headers. Anything that
    is not a decodable DSMS package (non-JSON bytes, JSON that is not an
    object, an undecodable payload) is streamed back unchanged.
    """
    import base64  # function-scope import, as other helpers in this module do

    content = await ipfs_cat(cid)

    def raw_response():
        # Fallback for objects that are not DSMS packages: return them as-is.
        return StreamingResponse(
            io.BytesIO(content),
            media_type="application/octet-stream"
        )

    # json.loads raises UnicodeDecodeError (not JSONDecodeError) for bytes that
    # are not valid UTF-8, e.g. a pinned binary file. Both are ValueError
    # subclasses, so catching ValueError keeps the raw fallback reachable.
    try:
        package = json.loads(content)
    except ValueError:
        return raw_response()
    if not isinstance(package, dict):
        return raw_response()

    encoded = package.get("content_base64")
    text = package.get("content")
    if isinstance(encoded, str) and encoded:
        # store_document writes the payload hex-encoded under this key; other
        # writers are described in this module as using real base64, so try
        # hex first and strict base64 second.
        try:
            original_content = bytes.fromhex(encoded)
        except ValueError:
            try:
                original_content = base64.b64decode(encoded, validate=True)
            except ValueError:
                return raw_response()
    elif isinstance(text, str):
        # Packages written by archive_legal_document keep their text under
        # "content" instead of "content_base64".
        original_content = text.encode("utf-8")
    elif not encoded:
        # A package without any payload yields an empty download.
        original_content = b""
    else:
        return raw_response()

    metadata = package.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    # The filename comes from the uploader; strip characters that would break
    # out of the quoted header value or split the header.
    filename = str(package.get("filename") or "document")
    filename = filename.replace('"', "").replace("\r", "").replace("\n", "")

    return StreamingResponse(
        io.BytesIO(original_content),
        media_type="application/octet-stream",
        headers={
            "Content-Disposition": f'attachment; filename="{filename}"',
            "X-DSMS-Document-Type": str(metadata.get("document_type") or "unknown"),
            "X-DSMS-Checksum": str(metadata.get("checksum") or ""),
            "X-DSMS-Created-At": str(metadata.get("created_at") or "")
        }
    )
@router.get("/api/v1/documents/{cid}/metadata")
async def get_document_metadata(
    cid: str,
    _auth: dict = Depends(verify_token)
):
    """
    Fetch only the metadata of a document.

    - **cid**: Content Identifier (IPFS hash)

    For a DSMS package the response holds `cid`, `metadata`, `filename` and
    the decoded payload `size` in bytes. For anything that is not a decodable
    DSMS package it holds `cid`, an empty `metadata` and the `raw_size` of the
    stored object.
    """
    import base64  # function-scope import, as other helpers in this module do

    content = await ipfs_cat(cid)
    raw_info = {
        "cid": cid,
        "metadata": {},
        "raw_size": len(content)
    }

    # json.loads raises UnicodeDecodeError (a ValueError, but not a
    # JSONDecodeError) for non-UTF-8 bytes, so catch ValueError here.
    try:
        package = json.loads(content)
    except ValueError:
        return raw_info
    if not isinstance(package, dict):
        return raw_info

    encoded = package.get("content_base64")
    text = package.get("content")
    size = 0
    if isinstance(encoded, str) and encoded:
        # Payloads are hex-encoded by store_document; fall back to strict
        # base64 for packages produced by other writers.
        try:
            size = len(bytes.fromhex(encoded))
        except ValueError:
            try:
                size = len(base64.b64decode(encoded, validate=True))
            except ValueError:
                return raw_info
    elif isinstance(text, str):
        # archive_legal_document stores plain text under "content".
        size = len(text.encode("utf-8"))
    elif encoded:
        # Payload field present but of an unexpected type.
        return raw_info

    return {
        "cid": cid,
        "metadata": package.get("metadata", {}),
        "filename": package.get("filename"),
        "size": size
    }
@router.get("/api/v1/documents", response_model=DocumentList)
async def list_documents(
    _auth: dict = Depends(verify_token)
):
    """
    List the stored documents.

    Only the first 100 pinned CIDs are inspected (performance limit); pins
    that cannot be fetched or parsed as a DSMS package are left out.
    """
    max_inspected = 100
    pinned = await ipfs_pin_ls()

    async def describe(pin):
        # Fetch one pinned object and summarise it as a list entry.
        package = json.loads(await ipfs_cat(pin))
        return {
            "cid": pin,
            "metadata": package.get("metadata", {}),
            "filename": package.get("filename")
        }

    documents = []
    for pin in pinned[:max_inspected]:
        try:
            entry = await describe(pin)
        except Exception:
            # Skip objects that are not DSMS packages.
            continue
        documents.append(entry)

    return DocumentList(documents=documents, total=len(documents))
@router.delete("/api/v1/documents/{cid}")
async def unpin_document(
    cid: str,
    _auth: dict = Depends(verify_token)
):
    """
    Remove a document from the local pin set.

    The document stays in the network but is removed on garbage collection.

    - **cid**: Content Identifier (IPFS hash)

    Raises HTTPException(404) whenever the IPFS API does not answer with
    status 200.
    """
    pin_rm_url = f"{IPFS_API_URL}/api/v0/pin/rm"
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(pin_rm_url, params={"arg": cid})

    if response.status_code == 200:
        return {
            "status": "unpinned",
            "cid": cid,
            "message": "Dokument wird bei nächster Garbage Collection entfernt"
        }

    # Every non-200 answer is reported to the caller as "not found".
    raise HTTPException(
        status_code=404,
        detail=f"Konnte Pin nicht entfernen: {cid}"
    )
@router.post("/api/v1/legal-documents/archive")
async def archive_legal_document(
    document_id: str,
    version: str,
    content: str,
    language: str = "de",
    _auth: dict = Depends(verify_token)
):
    """
    Permanently archive one version of a legal document.

    Intended for terms and conditions, privacy policies and similar texts.

    - **document_id**: ID of the legal document
    - **version**: Version number
    - **content**: HTML/Markdown content
    - **language**: Language

    Returns the CID, the SHA-256 checksum of the UTF-8 encoded content, the
    archive timestamp and a gateway URL for verification.
    """
    # NOTE(review): `content` is a plain `str` parameter, which FastAPI reads
    # from the query string — confirm large documents fit URL length limits.
    content_bytes = content.encode('utf-8')
    checksum = hashlib.sha256(content_bytes).hexdigest()

    # Take the timestamp once so that the value stored in the archived package
    # and the value returned to the caller are identical.
    archived_at = datetime.utcnow().isoformat()

    metadata = {
        "document_type": "legal_document",
        "document_id": document_id,
        "version": version,
        "language": language,
        "created_at": archived_at,
        "checksum": checksum,
        "content_type": "text/html"
    }

    # Unlike store_document, the text is kept verbatim under "content".
    package = {
        "metadata": metadata,
        "content": content,
        "archived_at": archived_at
    }
    package_bytes = json.dumps(package, ensure_ascii=False).encode('utf-8')

    # Add the package to IPFS.
    result = await ipfs_add(package_bytes)
    cid = result.get("Hash")

    return {
        "cid": cid,
        "document_id": document_id,
        "version": version,
        "checksum": checksum,
        "archived_at": archived_at,
        "verification_url": f"{IPFS_GATEWAY_URL}/ipfs/{cid}"
    }
# NOTE(review): no verify_token dependency is declared on this endpoint —
# confirm that unauthenticated access is intended.
@router.get("/api/v1/documents/{cid}/history")
@router.get("/documents/{cid}/history")  # legacy path, kept for backwards compatibility
async def get_document_history(cid: str):
    """
    Reconstruct the version history by following the parent_cid chain.

    Starts at `cid` and walks towards older versions, so the returned list is
    ordered newest first. The walk ends at the first package without a
    parent_cid, at the first fetch/parse failure, or after 50 entries.
    """
    depth_limit = 50  # upper bound on the walk, guards against endless chains
    summary_fields = (
        "version",
        "document_type",
        "document_id",
        "parent_cid",
        "created_at",
        "checksum",
    )

    entries = []
    cursor = cid
    while len(entries) < depth_limit:
        try:
            package = json.loads(await ipfs_cat(cursor))
            metadata = package.get("metadata", {})
            entry = {"cid": cursor}
            entry.update((field, metadata.get(field)) for field in summary_fields)
            entries.append(entry)
            next_cid = metadata.get("parent_cid")
        except Exception:
            # Best effort: an unreadable link simply ends the history.
            break
        if not next_cid:
            break
        cursor = next_cid

    return {"cid": cid, "history": entries, "depth": len(entries)}
# NOTE(review): no verify_token dependency is declared on this endpoint —
# confirm that unauthenticated access is intended.
@router.get("/api/v1/documents/{cid_a}/diff/{cid_b}")
async def diff_documents(cid_a: str, cid_b: str):
    """
    Compare two DSMS document versions by their CIDs.

    Returns a unified diff of the textual content when both documents are
    text-decodable (UTF-8). For binary documents the response indicates
    "binary" and returns just the metadata differences. Used by the Audit
    Timeline UI to render "what changed between V2 and V3 of CE-Akte X".

    Objects that are not DSMS packages are compared by their raw content and
    an empty metadata set. If a CID cannot be fetched, a dict with an "error"
    key is returned instead of a diff.
    """
    import base64  # function-scope import, as other helpers in this module do

    def as_package(raw) -> dict:
        # Parse a DSMS package; wrap anything else so that its raw content is
        # still compared instead of being silently treated as empty.
        try:
            parsed = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError also covers UnicodeDecodeError for non-UTF-8 bytes.
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        data = bytes(raw) if isinstance(raw, (bytes, bytearray)) else str(raw).encode("utf-8")
        try:
            return {"metadata": {}, "content": data.decode("utf-8")}
        except UnicodeDecodeError:
            return {"metadata": {}, "content_base64": base64.b64encode(data).decode("ascii")}

    try:
        raw_a = await ipfs_cat(cid_a)
        raw_b = await ipfs_cat(cid_b)
    except Exception as exc:
        return {"error": f"could not fetch one of the CIDs: {exc}", "cid_a": cid_a, "cid_b": cid_b}

    # Each document is interpreted on its own, so one raw payload no longer
    # blanks out the other, well-formed one.
    pkg_a = as_package(raw_a)
    pkg_b = as_package(raw_b)

    meta_a = pkg_a.get("metadata")
    meta_b = pkg_b.get("metadata")
    if not isinstance(meta_a, dict):
        meta_a = {}
    if not isinstance(meta_b, dict):
        meta_b = {}
    meta_diff = _diff_metadata(meta_a, meta_b)

    # Try to decode the content. The Archive flow stores files as base64 in
    # `content_base64`; older payloads may use `content` (utf-8 text).
    text_a, text_b, is_binary = _extract_texts(pkg_a, pkg_b)
    if is_binary:
        return {
            "cid_a": cid_a,
            "cid_b": cid_b,
            "kind": "binary",
            "metadata_diff": meta_diff,
            "note": "Binary payload — text diff omitted. Compare via the rendered tech-file export instead.",
        }

    diff_lines = list(
        _unified_diff(text_a.splitlines(), text_b.splitlines(), fromfile=cid_a, tofile=cid_b, lineterm="")
    )
    return {
        "cid_a": cid_a,
        "cid_b": cid_b,
        "kind": "text",
        "metadata_diff": meta_diff,
        "diff": "\n".join(diff_lines),
        # The "+++"/"---" file header lines are not content changes.
        "added_lines": sum(1 for ln in diff_lines if ln.startswith("+") and not ln.startswith("+++")),
        "removed_lines": sum(1 for ln in diff_lines if ln.startswith("-") and not ln.startswith("---")),
    }
# NOTE(review): no verify_token dependency is declared on this endpoint —
# confirm that unauthenticated access is intended.
@router.get("/api/v1/documents/{cid}/bulk-diff")
async def bulk_diff_chain(cid: str):
    """
    Aggregate the diffs over the whole parent_cid chain (V1 → V_latest).

    The chain is walked once starting at `cid`, then a diff is computed for
    each pair of consecutive versions in chronological order, together with
    running totals. Built for the "Bulk-Diff Report" panel of the IACE audit
    timeline, so the evolution of a tech-file is visible without opening
    every version pair.

    The walk covers at most 50 versions and ends early at a package without
    parent_cid, a self-referencing parent, or a fetch/parse failure. With
    fewer than two versions an empty report with an explanatory note is
    returned.
    """
    depth_limit = 50

    def count_marked(lines, marker):
        # Count diff lines starting with `marker`, skipping the file header
        # lines ("+++" / "---").
        header = marker * 3
        return sum(1 for ln in lines if ln.startswith(marker) and not ln.startswith(header))

    # Phase 1: collect the chain, newest first.
    chain: list[dict] = []
    cursor: Optional[str] = cid
    while cursor is not None and len(chain) < depth_limit:
        try:
            package = json.loads(await ipfs_cat(cursor))
        except Exception:
            break
        metadata = package.get("metadata", {}) or {}
        chain.append({
            "cid": cursor,
            "version": metadata.get("version"),
            "created_at": metadata.get("created_at"),
            "metadata": metadata,
            "package": package,
        })
        parent = metadata.get("parent_cid")
        if not parent or parent == cursor:
            break
        cursor = parent

    version_count = len(chain)
    if version_count < 2:
        return {
            "cid_latest": cid,
            "cid_baseline": cid,
            "versions": version_count,
            "steps": [],
            "totals": {"added_lines": 0, "removed_lines": 0, "metadata_fields_changed": 0, "binary_steps": 0},
            "note": "No predecessor versions found." if chain else "CID not found.",
        }

    # Phase 2: walk oldest → newest and diff each neighbouring pair.
    chain.reverse()
    steps: list[dict] = []
    totals = {"added": 0, "removed": 0, "binary": 0}
    touched_fields: set[str] = set()

    for older, newer in zip(chain, chain[1:]):
        meta_diff = _diff_metadata(older["metadata"], newer["metadata"])
        text_a, text_b, is_binary = _extract_texts(older["package"], newer["package"])

        if is_binary:
            # No text diff for binary payloads; the step only counts as binary.
            kind, added, removed = "binary", 0, 0
            totals["binary"] += 1
        else:
            diff_lines = list(
                _unified_diff(
                    text_a.splitlines(),
                    text_b.splitlines(),
                    fromfile=older["cid"],
                    tofile=newer["cid"],
                    lineterm="",
                )
            )
            kind = "text"
            added = count_marked(diff_lines, "+")
            removed = count_marked(diff_lines, "-")
            totals["added"] += added
            totals["removed"] += removed

        touched_fields.update(meta_diff.keys())
        steps.append({
            "from": older["cid"],
            "from_version": older["version"],
            "to": newer["cid"],
            "to_version": newer["version"],
            "created_at": newer["created_at"],
            "metadata_diff_fields": sorted(meta_diff.keys()),
            "kind": kind,
            "added_lines": added,
            "removed_lines": removed,
        })

    return {
        "cid_latest": cid,
        "cid_baseline": chain[0]["cid"],
        "versions": version_count,
        "steps": steps,
        "totals": {
            "added_lines": totals["added"],
            "removed_lines": totals["removed"],
            "metadata_fields_changed": len(touched_fields),
            "binary_steps": totals["binary"],
        },
    }
def _diff_metadata(a: dict, b: dict) -> dict:
"""Return per-field change list: {field: {"old": ..., "new": ...}}."""
keys = set(a.keys()) | set(b.keys())
changes = {}
for k in sorted(keys):
if a.get(k) != b.get(k):
changes[k] = {"old": a.get(k), "new": b.get(k)}
return changes
def _extract_texts(pkg_a: dict, pkg_b: dict) -> tuple[str, str, bool]:
"""Return (text_a, text_b, is_binary). Falls back to base64-decode."""
import base64
def to_text(pkg: dict) -> tuple[str, bool]:
if isinstance(pkg.get("content"), str):
return pkg["content"], False
b64 = pkg.get("content_base64")
if not b64:
return "", False
try:
raw = base64.b64decode(b64)
except Exception:
return "", True
try:
return raw.decode("utf-8"), False
except UnicodeDecodeError:
return "", True
text_a, bin_a = to_text(pkg_a)
text_b, bin_b = to_text(pkg_b)
return text_a, text_b, (bin_a or bin_b)
def _unified_diff(a, b, fromfile, tofile, lineterm):
"""Tiny shim around difflib.unified_diff so the function reads cleanly."""
import difflib
return difflib.unified_diff(a, b, fromfile=fromfile, tofile=tofile, lineterm=lineterm, n=2)