"""Corpus + license overview — which source documents are in the corpus and under which license / usage rights. Read-only; backs the admin coverage page so the team can SEE every use-case and every ingested document with its license (and not forget any). See use_case_controls for the per-topic retrieval. """ from __future__ import annotations from typing import Any, Optional from sqlalchemy import text from sqlalchemy.orm import Session from compliance.data.use_case_registry import use_case_for_regulation # canonical_controls.license_rule is a coarse 3-tier flag (the detailed terms # live in canonical_control_licenses, keyed per curated source). _TIER: dict[int, str] = { 1: "Öffentlich / frei nutzbar (Public Domain, EU-Recht)", 2: "Offen mit Attribution (CC-BY / CC-BY-SA)", 3: "Eigenformulierung / eingeschränkt", } _LIVE = ("cc.decomposition_method = 'pass0b' " "AND cc.release_state NOT IN ('deprecated', 'duplicate', 'rejected')") def license_tier_label(rule: Optional[int]) -> str: """Human label for the coarse license_rule tier. Pure → unit-testable.""" return _TIER.get(rule or 0, "unbekannt") def corpus_overview(db: Session) -> dict[str, Any]: """Three views for the coverage page: (1) atom counts per license tier, (2) every source document (source_regulation) with tier + count + mapped use-case, (3) the curated license catalog with detailed usage rights.""" summary = [ { "license_rule": int(r[0]) if r[0] is not None else None, "label": license_tier_label(r[0]), "atom_count": int(r[1]), } for r in db.execute(text( f"SELECT cc.license_rule, count(*) FROM canonical_controls cc " f"WHERE {_LIVE} GROUP BY cc.license_rule ORDER BY cc.license_rule" )).fetchall() ] documents = [ { "source_regulation": r.src, "license_rule": int(r.lic) if r.lic is not None else None, "license_tier": license_tier_label(r.lic), "atom_count": int(r.n), "use_case": use_case_for_regulation(r.src), } for r in db.execute(text( f"SELECT cpl.source_regulation AS src, max(cc.license_rule) AS lic, " f"count(DISTINCT cc.id) AS n FROM canonical_controls cc " f"JOIN control_parent_links cpl ON cpl.control_uuid = cc.id " f"WHERE {_LIVE} AND coalesce(cpl.source_regulation, '') <> '' " f"GROUP BY cpl.source_regulation ORDER BY n DESC" )).fetchall() ] catalog: list[dict[str, Any]] = [] if db.execute(text( "SELECT to_regclass('compliance.canonical_control_sources')" )).scalar() is not None: catalog = [ { "source_id": r.source_id, "title": r.title, "publisher": r.publisher, "url": r.url, "version": r.version_label, "license_id": r.license_id, "license_name": r.license_name, "commercial_use": r.commercial_use, "ship_in_product": r.allowed_ship_in_product, "terms_url": r.terms_url, } for r in db.execute(text( "SELECT s.source_id, s.title, s.publisher, s.url, s.version_label, " "s.license_id, s.allowed_ship_in_product, l.name AS license_name, " "l.commercial_use, l.terms_url " "FROM canonical_control_sources s " "LEFT JOIN canonical_control_licenses l ON l.license_id = s.license_id " "ORDER BY s.publisher NULLS LAST, s.title" )).fetchall() ] return { "license_summary": summary, "documents": documents, "license_catalog": catalog, "totals": {"documents": len(documents), "catalog_sources": len(catalog)}, }