216c7b8eca
CI / detect-changes (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / build-sha-integrity (push) Failing after 4s
CI / validate-canonical-controls (push) Successful in 10s
CI / loc-budget (push) Successful in 14s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m21s
CI / test-go (push) Failing after 37s
CI / iace-gt-coverage (push) Successful in 23s
CI / test-python-backend (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Successful in 17s
Punkt 1 — UI-CID-Badge nach erfolgreichem Tech-File-Export:
- archiveTechFile setzt X-DSMS-CID / X-DSMS-Filename / X-DSMS-Size response
headers + Access-Control-Expose-Headers, sobald DSMS-Archive durchlief
- Split iace_handler_techfile.go (war ueber 500 LOC) → archiveTechFile lebt
jetzt in iace_handler_techfile_archive.go, setDSMSResponseHeaders als
pure Helper mit 3 unit tests
- Next.js IACE-Proxy forwarded die X-DSMS-* Header und erkennt jetzt auch
XLSX/DOCX/MD als Binary-Response (vorher nur PDF/ZIP/octet-stream)
- ExportCIDBadge.tsx zeigt CID, Filename, Groesse + Kopieren-Button +
"Verlauf anzeigen" (oeffnet CIDHistoryModal)
Punkt 2 — Bulk-Diff Report V1 → V_latest:
- Neuer Endpoint GET /api/v1/documents/{cid}/bulk-diff im dsms-gateway:
laeuft parent_cid-Kette ab, berechnet chronologische Step-Diffs,
aggregiert Totals (added/removed lines, metadata_fields_changed,
binary_steps). Edge-Cases: einzelne Version, binaere Steps, abgebrochene
Kette
- BulkDiffPanel.tsx zeigt 4-Stat-Header + Step-Tabelle
- CIDHistoryModal bekommt Toggle-Button "Bulk-Diff V1 → V_latest anzeigen"
neben dem Versions-Counter; damit auch vom IACE-Export-Badge erreichbar
Tests: 3 neue Go-Tests, 4 neue pytest-Tests, alle gruen
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
487 lines
15 KiB
Python
487 lines
15 KiB
Python
"""
|
|
Documents router — handles /api/v1/documents and /api/v1/legal-documents endpoints.
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import io
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
|
|
from fastapi.responses import StreamingResponse
|
|
|
|
from models import DocumentList, DocumentMetadata, StoredDocument
|
|
from dependencies import verify_token, ipfs_add, ipfs_cat, ipfs_pin_ls
|
|
from config import IPFS_API_URL, IPFS_GATEWAY_URL
|
|
|
|
router = APIRouter()
|
|
|
|
|
|
@router.post("/api/v1/documents", response_model=StoredDocument)
async def store_document(
    file: UploadFile = File(...),
    document_type: str = "legal_document",
    document_id: Optional[str] = None,
    version: Optional[str] = None,
    language: str = "de",
    _auth: dict = Depends(verify_token)
):
    """
    Store a document in the DSMS.

    - **file**: The document to store
    - **document_type**: Type of the document (legal_document, consent_record, audit_log)
    - **document_id**: Optional ID of the document
    - **version**: Optional version number
    - **language**: Language (default: de)
    """
    payload = await file.read()

    # The SHA-256 digest of the raw upload is recorded alongside the metadata.
    digest = hashlib.sha256(payload).hexdigest()

    doc_meta = DocumentMetadata(
        document_type=document_type,
        document_id=document_id,
        version=version,
        language=language,
        created_at=datetime.utcnow().isoformat(),
        checksum=digest,
        encrypted=False,
    )

    # Wrap payload and metadata into a single JSON envelope.
    # NOTE: despite the key name, the value under "content_base64" is
    # hex-encoded; readers of this envelope have to decode it accordingly.
    envelope = json.dumps({
        "metadata": doc_meta.model_dump(),
        "content_base64": payload.hex(),
        "filename": file.filename,
    }).encode()

    # Hand the envelope to ipfs_add and read CID / size from its result dict.
    ipfs_result = await ipfs_add(envelope)
    stored_cid = ipfs_result.get("Hash")
    stored_size = int(ipfs_result.get("Size", 0))

    return StoredDocument(
        cid=stored_cid,
        size=stored_size,
        metadata=doc_meta,
        gateway_url=f"{IPFS_GATEWAY_URL}/ipfs/{stored_cid}",
        timestamp=datetime.utcnow().isoformat(),
    )
|
|
|
|
|
|
@router.get("/api/v1/documents/{cid}")
async def get_document(
    cid: str,
    _auth: dict = Depends(verify_token)
):
    """
    Retrieve a document from the DSMS.

    - **cid**: Content Identifier (IPFS hash)

    DSMS packages (JSON objects carrying ``content_base64`` or ``content``)
    are unpacked and returned as an attachment with ``X-DSMS-*`` headers.
    Everything else is streamed back unchanged as ``application/octet-stream``.

    Raises:
        HTTPException: 500 when a package payload is neither valid hex nor
            valid base64.
    """
    import base64
    import binascii
    from urllib.parse import quote

    content = await ipfs_cat(cid)

    # json.loads raises JSONDecodeError for malformed JSON and
    # UnicodeDecodeError for non-UTF-8 binary input; both are ValueError
    # subclasses, so binary objects now reach the raw fallback too.
    try:
        package = json.loads(content)
    except ValueError:
        package = None

    encoded = package.get("content_base64") if isinstance(package, dict) else None
    text_payload = package.get("content") if isinstance(package, dict) else None

    if isinstance(encoded, str):
        # store_document writes the payload hex-encoded under this key; a
        # comment in diff_documents says other writers use real base64, so
        # fall back to that when the value is not valid hex.
        try:
            original_content = bytes.fromhex(encoded)
        except ValueError:
            try:
                original_content = base64.b64decode(encoded, validate=True)
            except (binascii.Error, ValueError) as exc:
                raise HTTPException(
                    status_code=500,
                    detail=f"Konnte Dokument-Inhalt nicht dekodieren: {cid}"
                ) from exc
    elif isinstance(text_payload, str):
        # archive_legal_document stores plain text under "content".
        original_content = text_payload.encode("utf-8")
    else:
        # Not a DSMS package: return the raw object untouched.
        return StreamingResponse(
            io.BytesIO(content),
            media_type="application/octet-stream"
        )

    metadata = package.get("metadata")
    if not isinstance(metadata, dict):
        metadata = {}

    filename = package.get("filename")
    if not isinstance(filename, str) or not filename:
        filename = "document"

    # Quotes, backslashes, control characters and non-ASCII characters would
    # corrupt (or allow injection into) the header, so the plain parameter
    # gets a sanitized ASCII fallback and the exact name goes into the
    # RFC 6266 extended parameter.
    ascii_name = "".join(
        ch if (32 <= ord(ch) < 127 and ch not in '"\\') else "_"
        for ch in filename
    )
    disposition = f'attachment; filename="{ascii_name}"'
    if ascii_name != filename:
        disposition += f"; filename*=UTF-8''{quote(filename, safe='')}"

    # Header values must be strings; metadata fields may be stored as null.
    return StreamingResponse(
        io.BytesIO(original_content),
        media_type="application/octet-stream",
        headers={
            "Content-Disposition": disposition,
            "X-DSMS-Document-Type": str(metadata.get("document_type") or "unknown"),
            "X-DSMS-Checksum": str(metadata.get("checksum") or ""),
            "X-DSMS-Created-At": str(metadata.get("created_at") or "")
        }
    )
|
|
|
|
|
|
@router.get("/api/v1/documents/{cid}/metadata")
async def get_document_metadata(
    cid: str,
    _auth: dict = Depends(verify_token)
):
    """
    Retrieve only the metadata of a document.

    - **cid**: Content Identifier (IPFS hash)

    For DSMS packages the response carries ``cid``, ``metadata``,
    ``filename`` and the decoded payload ``size`` in bytes (``None`` when the
    payload cannot be decoded). For any other object it carries ``cid``, an
    empty ``metadata`` dict and the ``raw_size`` of the stored bytes.
    """
    import base64
    import binascii

    content = await ipfs_cat(cid)

    # ValueError covers JSONDecodeError as well as the UnicodeDecodeError
    # json.loads raises for non-UTF-8 binary objects.
    try:
        package = json.loads(content)
    except ValueError:
        package = None

    if not isinstance(package, dict):
        return {
            "cid": cid,
            "metadata": {},
            "raw_size": len(content)
        }

    encoded = package.get("content_base64")
    text_payload = package.get("content")
    if isinstance(encoded, str):
        # store_document writes hex under this key; fall back to base64 for
        # packages from writers that use real base64.
        try:
            size = len(bytes.fromhex(encoded))
        except ValueError:
            try:
                size = len(base64.b64decode(encoded, validate=True))
            except (binascii.Error, ValueError):
                # An undecodable payload should not hide the metadata.
                size = None
    elif isinstance(text_payload, str):
        # Packages from archive_legal_document keep plain text in "content".
        size = len(text_payload.encode("utf-8"))
    else:
        size = 0

    return {
        "cid": cid,
        "metadata": package.get("metadata") or {},
        "filename": package.get("filename"),
        "size": size
    }
|
|
|
|
|
|
@router.get("/api/v1/documents", response_model=DocumentList)
async def list_documents(
    _auth: dict = Depends(verify_token)
):
    """
    List the stored documents.

    Only the first 100 CIDs returned by ``ipfs_pin_ls`` are inspected, and
    objects that cannot be fetched or parsed as a package are left out, so
    ``total`` counts the listed entries only.
    """
    pinned = await ipfs_pin_ls()

    entries = []
    # Capped at 100 CIDs for performance: every entry costs one ipfs_cat call.
    for pinned_cid in pinned[:100]:
        try:
            envelope = json.loads(await ipfs_cat(pinned_cid))
            entry = {
                "cid": pinned_cid,
                "metadata": envelope.get("metadata", {}),
                "filename": envelope.get("filename"),
            }
        except Exception:
            # Best effort: skip anything that is not a DSMS package.
            continue
        entries.append(entry)

    return DocumentList(documents=entries, total=len(entries))
|
|
|
|
|
|
@router.delete("/api/v1/documents/{cid}")
async def unpin_document(
    cid: str,
    _auth: dict = Depends(verify_token)
):
    """
    Remove a document from the local pin set.

    The document stays in the network but is removed on garbage collection.

    - **cid**: Content Identifier (IPFS hash)

    Raises:
        HTTPException: 404 whenever the IPFS API answers with anything other
            than HTTP 200.
    """
    unpin_url = f"{IPFS_API_URL}/api/v0/pin/rm"

    async with httpx.AsyncClient(timeout=30.0) as ipfs_client:
        ipfs_response = await ipfs_client.post(unpin_url, params={"arg": cid})

    pin_removed = ipfs_response.status_code == 200
    if not pin_removed:
        raise HTTPException(
            status_code=404,
            detail=f"Konnte Pin nicht entfernen: {cid}",
        )

    return {
        "status": "unpinned",
        "cid": cid,
        "message": "Dokument wird bei nächster Garbage Collection entfernt",
    }
|
|
|
|
|
|
@router.post("/api/v1/legal-documents/archive")
async def archive_legal_document(
    document_id: str,
    version: str,
    content: str,
    language: str = "de",
    _auth: dict = Depends(verify_token)
):
    """
    Permanently archive a version of a legal document.

    Intended for terms and conditions, privacy policies and similar texts.

    - **document_id**: ID of the legal document
    - **version**: Version number
    - **content**: HTML/Markdown content
    - **language**: Language

    Returns a dict with the CID, the SHA-256 checksum of the UTF-8 encoded
    content, the archive timestamp and a gateway URL for verification.
    """
    # NOTE(review): these scalar parameters are not declared as body fields,
    # so FastAPI reads them from the query string - including `content`.
    # Confirm that callers never exceed URL length limits.

    content_bytes = content.encode('utf-8')
    checksum = hashlib.sha256(content_bytes).hexdigest()

    # Take the timestamp once so that the stored package and the response
    # report the same archive time; separate utcnow() calls drift apart by
    # however long the IPFS write takes.
    archived_at = datetime.utcnow().isoformat()

    metadata = {
        "document_type": "legal_document",
        "document_id": document_id,
        "version": version,
        "language": language,
        "created_at": archived_at,
        "checksum": checksum,
        "content_type": "text/html"
    }

    package = {
        "metadata": metadata,
        "content": content,
        "archived_at": archived_at
    }

    # ensure_ascii=False keeps non-ASCII characters readable in the archive.
    package_bytes = json.dumps(package, ensure_ascii=False).encode('utf-8')

    result = await ipfs_add(package_bytes)
    cid = result.get("Hash")

    return {
        "cid": cid,
        "document_id": document_id,
        "version": version,
        "checksum": checksum,
        "archived_at": archived_at,
        "verification_url": f"{IPFS_GATEWAY_URL}/ipfs/{cid}"
    }
|
|
|
|
|
|
@router.get("/api/v1/documents/{cid}/history")
@router.get("/documents/{cid}/history")  # legacy path, kept for backwards compatibility
async def get_document_history(cid: str):
    """
    Follow the parent_cid chain to reconstruct version history.

    Starts at ``cid`` and walks ``metadata.parent_cid`` links, newest first.
    The walk stops at the first object that cannot be fetched or parsed, at a
    CID that was already visited, or after 50 entries.

    Returns ``{"cid": ..., "history": [...], "depth": len(history)}``.
    """
    # NOTE(review): unlike the other document endpoints this one has no
    # Depends(verify_token) - confirm that this is intentional.
    history = []
    visited = set()
    current_cid = cid
    max_depth = 50  # hard upper bound on chain length

    while current_cid and len(history) < max_depth:
        # A chain that loops back on itself (A -> B -> A) would otherwise
        # repeat the same entries until max_depth is reached.
        if current_cid in visited:
            break
        visited.add(current_cid)

        try:
            raw = await ipfs_cat(current_cid)
            package = json.loads(raw)
            metadata = package.get("metadata")
        except Exception:
            # Best effort: an unreadable link ends the chain, the versions
            # collected so far are still returned.
            break
        if not isinstance(metadata, dict):
            # Missing or null metadata still counts as a version entry.
            metadata = {}

        parent = metadata.get("parent_cid")
        history.append({
            "cid": current_cid,
            "version": metadata.get("version"),
            "document_type": metadata.get("document_type"),
            "document_id": metadata.get("document_id"),
            "parent_cid": parent,
            "created_at": metadata.get("created_at"),
            "checksum": metadata.get("checksum"),
        })

        # Only a string can be a CID (and be tracked in the visited set).
        current_cid = parent if isinstance(parent, str) else None

    return {"cid": cid, "history": history, "depth": len(history)}
|
|
|
|
|
|
@router.get("/api/v1/documents/{cid_a}/diff/{cid_b}")
async def diff_documents(cid_a: str, cid_b: str):
    """
    Compare two DSMS document versions by their CIDs.

    Returns a unified diff of the textual content when both documents are
    text-decodable (UTF-8). For binary documents the response indicates
    "binary" and returns just the metadata differences. Objects that are not
    wrapped DSMS packages are compared by their raw content. Used by the
    Audit Timeline UI to render "what changed between V2 and V3 of CE-Akte X".

    A fetch failure is reported as a regular response containing an
    ``error`` key rather than as an HTTP error.
    """
    # NOTE(review): unlike the other document endpoints this one has no
    # Depends(verify_token) - confirm that this is intentional.
    try:
        raw_a = await ipfs_cat(cid_a)
        raw_b = await ipfs_cat(cid_b)
    except Exception as exc:
        return {"error": f"could not fetch one of the CIDs: {exc}", "cid_a": cid_a, "cid_b": cid_b}

    pkg_a, raw_binary_a = _package_from_raw(raw_a)
    pkg_b, raw_binary_b = _package_from_raw(raw_b)

    meta_a = pkg_a.get("metadata", {}) or {}
    meta_b = pkg_b.get("metadata", {}) or {}
    meta_diff = _diff_metadata(meta_a, meta_b)

    # Packages keep their payload either encoded in `content_base64` or as
    # UTF-8 text in `content`; _extract_texts handles both.
    text_a, text_b, is_binary = _extract_texts(pkg_a, pkg_b)

    if is_binary or raw_binary_a or raw_binary_b:
        return {
            "cid_a": cid_a,
            "cid_b": cid_b,
            "kind": "binary",
            "metadata_diff": meta_diff,
            "note": "Binary payload — text diff omitted. Compare via the rendered tech-file export instead.",
        }

    diff_lines = list(
        _unified_diff(text_a.splitlines(), text_b.splitlines(), fromfile=cid_a, tofile=cid_b, lineterm="")
    )
    # The first two lines of a non-empty unified diff are the '---'/'+++'
    # file headers. Skipping them by position keeps content lines that
    # themselves start with '++' or '--' in the counts.
    change_lines = diff_lines[2:]
    return {
        "cid_a": cid_a,
        "cid_b": cid_b,
        "kind": "text",
        "metadata_diff": meta_diff,
        "diff": "\n".join(diff_lines),
        "added_lines": sum(1 for ln in change_lines if ln.startswith("+")),
        "removed_lines": sum(1 for ln in change_lines if ln.startswith("-")),
    }


def _package_from_raw(raw) -> tuple[dict, bool]:
    """Turn fetched bytes into a package dict; returns (package, is_binary).

    A JSON object is returned as-is. Anything else is wrapped so that its raw
    content takes part in the text diff; raw content that is not UTF-8 is
    flagged as binary instead.
    """
    try:
        parsed = json.loads(raw)
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        parsed = None
    if isinstance(parsed, dict):
        return parsed, False

    if isinstance(raw, (bytes, bytearray)):
        try:
            text = bytes(raw).decode("utf-8")
        except UnicodeDecodeError:
            return {"metadata": {}}, True
    else:
        text = str(raw)
    return {"metadata": {}, "content": text}, False
|
|
|
|
|
|
@router.get("/api/v1/documents/{cid}/bulk-diff")
async def bulk_diff_chain(cid: str):
    """
    Aggregate diff across the entire parent_cid chain (V1 → V_latest).

    Walks the history chain once, then computes per-step diffs between every
    chronological pair plus running totals. Designed for the "Bulk-Diff
    Report" panel in the IACE audit timeline so the user can see how a
    tech-file evolved across all versions without clicking each pair.

    The walk stops at the first object that cannot be fetched or is not a
    JSON object, at a CID that was already visited, or after 50 versions.
    With fewer than two versions the response has no steps and carries a
    ``note`` instead.
    """
    # NOTE(review): unlike the other document endpoints this one has no
    # Depends(verify_token) - confirm that this is intentional.
    history: list[dict] = []
    visited: set[str] = set()
    current_cid: Optional[str] = cid
    max_depth = 50

    while current_cid and len(history) < max_depth:
        # Guards against self-references as well as longer cycles
        # (A -> B -> A), which would otherwise be diffed over and over.
        if current_cid in visited:
            break
        visited.add(current_cid)

        try:
            raw = await ipfs_cat(current_cid)
            package = json.loads(raw)
        except Exception:
            # Broken chain: keep what was collected so far.
            break
        if not isinstance(package, dict):
            break

        metadata = package.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}
        history.append({
            "cid": current_cid,
            "version": metadata.get("version"),
            "created_at": metadata.get("created_at"),
            "metadata": metadata,
            "package": package,
        })

        parent = metadata.get("parent_cid")
        current_cid = parent if isinstance(parent, str) else None

    if len(history) < 2:
        return {
            "cid_latest": cid,
            "cid_baseline": cid,
            "versions": len(history),
            "steps": [],
            "totals": {"added_lines": 0, "removed_lines": 0, "metadata_fields_changed": 0, "binary_steps": 0},
            "note": "No predecessor versions found." if history else "CID not found.",
        }

    # history is newest→oldest; reverse to walk chronologically.
    chronological = list(reversed(history))
    steps: list[dict] = []
    total_added = 0
    total_removed = 0
    binary_steps = 0
    fields_changed: set[str] = set()

    for older, newer in zip(chronological, chronological[1:]):
        meta_diff = _diff_metadata(older["metadata"], newer["metadata"])
        text_a, text_b, is_binary = _extract_texts(older["package"], newer["package"])

        step: dict = {
            "from": older["cid"],
            "from_version": older["version"],
            "to": newer["cid"],
            "to_version": newer["version"],
            "created_at": newer["created_at"],
            "metadata_diff_fields": sorted(meta_diff.keys()),
        }

        if is_binary:
            step["kind"] = "binary"
            step["added_lines"] = 0
            step["removed_lines"] = 0
            binary_steps += 1
        else:
            diff_lines = list(
                _unified_diff(text_a.splitlines(), text_b.splitlines(), fromfile=older["cid"], tofile=newer["cid"], lineterm="")
            )
            # Drop the '---'/'+++' file headers by position so that content
            # lines which themselves start with '++' or '--' still count.
            change_lines = diff_lines[2:]
            added = sum(1 for ln in change_lines if ln.startswith("+"))
            removed = sum(1 for ln in change_lines if ln.startswith("-"))
            step["kind"] = "text"
            step["added_lines"] = added
            step["removed_lines"] = removed
            total_added += added
            total_removed += removed

        fields_changed.update(meta_diff.keys())
        steps.append(step)

    return {
        "cid_latest": cid,
        "cid_baseline": chronological[0]["cid"],
        "versions": len(history),
        "steps": steps,
        "totals": {
            "added_lines": total_added,
            "removed_lines": total_removed,
            # Number of distinct field names that changed in any step.
            "metadata_fields_changed": len(fields_changed),
            "binary_steps": binary_steps,
        },
    }
|
|
|
|
|
|
def _diff_metadata(a: dict, b: dict) -> dict:
|
|
"""Return per-field change list: {field: {"old": ..., "new": ...}}."""
|
|
keys = set(a.keys()) | set(b.keys())
|
|
changes = {}
|
|
for k in sorted(keys):
|
|
if a.get(k) != b.get(k):
|
|
changes[k] = {"old": a.get(k), "new": b.get(k)}
|
|
return changes
|
|
|
|
|
|
def _extract_texts(pkg_a: dict, pkg_b: dict) -> tuple[str, str, bool]:
|
|
"""Return (text_a, text_b, is_binary). Falls back to base64-decode."""
|
|
import base64
|
|
|
|
def to_text(pkg: dict) -> tuple[str, bool]:
|
|
if isinstance(pkg.get("content"), str):
|
|
return pkg["content"], False
|
|
b64 = pkg.get("content_base64")
|
|
if not b64:
|
|
return "", False
|
|
try:
|
|
raw = base64.b64decode(b64)
|
|
except Exception:
|
|
return "", True
|
|
try:
|
|
return raw.decode("utf-8"), False
|
|
except UnicodeDecodeError:
|
|
return "", True
|
|
|
|
text_a, bin_a = to_text(pkg_a)
|
|
text_b, bin_b = to_text(pkg_b)
|
|
return text_a, text_b, (bin_a or bin_b)
|
|
|
|
|
|
def _unified_diff(a, b, fromfile, tofile, lineterm):
|
|
"""Tiny shim around difflib.unified_diff so the function reads cleanly."""
|
|
import difflib
|
|
|
|
return difflib.unified_diff(a, b, fromfile=fromfile, tofile=tofile, lineterm=lineterm, n=2)
|