feat: add RAG corpus versioning and source policy backend
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 34s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s

Part 1 — RAG Corpus Versioning:
- New DB table compliance_corpus_versions (migration 017)
- Go CorpusVersionStore with CRUD operations
- Assessment struct extended with corpus_version_id
- API endpoints: GET /rag/corpus-status, /rag/corpus-versions/:collection
- RAG routes (search, regulations) now registered in main.go
- Ingestion script registers corpus versions after each run
- Frontend staleness badge in SDK sidebar

Part 3 — Source Policy Backend:
- New FastAPI router with CRUD for allowed sources, PII rules,
  operations matrix, audit trail, stats, and compliance report
- SQLAlchemy models for all source policy tables (migration 001)
- Frontend API base corrected from edu-search:8088/8089 to
  backend-compliance:8002/api

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-02 07:58:08 +01:00
parent 187dbf1b77
commit a228b3b528
15 changed files with 2020 additions and 11 deletions

View File

@@ -3,8 +3,8 @@
/**
* Source Policy Management Page (SDK Version)
*
* Whitelist-based data source management for edu-search-service.
* For auditors: Full audit trail for all changes.
* Whitelist-based data source management for compliance RAG corpus.
* Controls which legal sources may be used, PII rules, and audit trail.
*/
import { useState, useEffect } from 'react'
@@ -15,14 +15,14 @@ import { OperationsMatrixTab } from '@/components/sdk/source-policy/OperationsMa
import { PIIRulesTab } from '@/components/sdk/source-policy/PIIRulesTab'
import { AuditTab } from '@/components/sdk/source-policy/AuditTab'
// API base URL for edu-search-service
// API base URL for backend-compliance
const getApiBase = () => {
if (typeof window === 'undefined') return 'http://localhost:8088'
if (typeof window === 'undefined') return 'http://localhost:8002/api'
const hostname = window.location.hostname
if (hostname === 'localhost' || hostname === '127.0.0.1') {
return 'http://localhost:8088'
return 'http://localhost:8002/api'
}
return `https://${hostname}:8089`
return `https://${hostname}:8002/api`
}
interface PolicyStats {

View File

@@ -10,6 +10,7 @@ import {
getStepsForPackage,
type SDKPackageId,
type SDKStep,
type RAGCorpusStatus,
} from '@/lib/sdk'
// =============================================================================
@@ -288,6 +289,41 @@ interface SDKSidebarProps {
onCollapsedChange?: (collapsed: boolean) => void
}
// =============================================================================
// CORPUS STALENESS INFO
// =============================================================================
/**
 * Compact corpus-freshness indicator for the SDK sidebar.
 * Shows the total chunk count across all RAG collections and flags the
 * corpus with an amber badge when the newest ingestion is older than 30 days.
 * Renders nothing when no collections are reported.
 */
function CorpusStalenessInfo({ ragCorpusStatus }: { ragCorpusStatus: RAGCorpusStatus }) {
  const { collections } = ragCorpusStatus
  const entries = Object.values(collections)
  if (entries.length === 0) return null

  // Single pass: find the most recent ingestion timestamp and sum up chunks.
  let newest = new Date(0)
  let totalChunks = 0
  for (const entry of entries) {
    const updated = new Date(entry.last_updated)
    if (updated > newest) newest = updated
    totalChunks += entry.chunks_count
  }

  const daysSinceUpdate = Math.floor((Date.now() - newest.getTime()) / (1000 * 60 * 60 * 24))

  return (
    <div className="px-4 py-2 border-b border-gray-100">
      <div className="flex items-center gap-2 text-xs">
        <div className={`w-2 h-2 rounded-full flex-shrink-0 ${daysSinceUpdate > 30 ? 'bg-amber-400' : 'bg-green-400'}`} />
        <span className="text-gray-500 truncate">
          RAG Corpus: {totalChunks} Chunks
        </span>
      </div>
      {daysSinceUpdate > 30 && (
        <div className="mt-1 text-xs text-amber-600 bg-amber-50 rounded px-2 py-1">
          Corpus {daysSinceUpdate}d alt Re-Evaluation empfohlen
        </div>
      )}
    </div>
  )
}
export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarProps) {
const pathname = usePathname()
const { state, packageCompletion, completionPercentage, getCheckpointStatus } = useSDK()
@@ -391,6 +427,11 @@ export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarP
</div>
)}
{/* RAG Corpus Staleness Badge */}
{!collapsed && state.ragCorpusStatus && (
<CorpusStalenessInfo ragCorpusStatus={state.ragCorpusStatus} />
)}
{/* Navigation - 5 Packages */}
<nav className="flex-1 overflow-y-auto">
{SDK_PACKAGES.map(pkg => {
@@ -510,6 +551,18 @@ export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarP
isActive={pathname === '/sdk/dsms'}
collapsed={collapsed}
/>
<AdditionalModuleItem
href="/development/sdk-flow"
icon={
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2}
d="M13 10V3L4 14h7v7l9-11h-7z" />
</svg>
}
label="SDK Flow"
isActive={pathname === '/development/sdk-flow'}
collapsed={collapsed}
/>
{state.companyProfile?.machineBuilder?.ceMarkingRequired && (
<AdditionalModuleItem
href="/sdk/iace"

View File

@@ -102,6 +102,9 @@ const initialState: SDKState = {
// IACE (Industrial AI Compliance Engine)
iaceProjects: [],
// RAG Corpus Versioning
ragCorpusStatus: null,
// Security
sbom: null,
securityIssues: [],

View File

@@ -975,6 +975,22 @@ export interface SBOMDependency {
to: string
}
// RAG Corpus Versioning
/**
 * Version status of a single RAG collection as returned by the backend
 * corpus-status endpoint.
 */
export interface RAGCorpusCollectionStatus {
  /** Corpus version record id. */
  id: string
  /** Version label of the latest ingestion, e.g. a date-based tag. */
  current_version: string
  documents_count: number
  chunks_count: number
  /** Regulation identifiers covered by this collection. */
  regulations: string[]
  /** Timestamp of the latest ingestion run (parsed with `new Date()` by consumers). */
  last_updated: string
  /** Content digest over the corpus — presumably SHA-256; confirm with backend. */
  digest: string
}
/** Snapshot of every collection's status plus when the client fetched it. */
export interface RAGCorpusStatus {
  /** Keyed by collection name. */
  collections: Record<string, RAGCorpusCollectionStatus>
  /** Client-side timestamp of the fetch, for staleness bookkeeping. */
  fetchedAt: string
}
export interface SBOM {
format: 'CycloneDX' | 'SPDX'
version: string
@@ -1504,6 +1520,9 @@ export interface SDKState {
// IACE (Industrial AI Compliance Engine)
iaceProjects: IACEProjectSummary[]
// RAG Corpus Versioning
ragCorpusStatus: RAGCorpusStatus | null
// Security
sbom: SBOM | null
securityIssues: SecurityIssue[]

View File

@@ -62,6 +62,7 @@ func main() {
dsgvoStore := dsgvo.NewStore(pool)
uccaStore := ucca.NewStore(pool)
escalationStore := ucca.NewEscalationStore(pool)
corpusVersionStore := ucca.NewCorpusVersionStore(pool)
roadmapStore := roadmap.NewStore(pool)
workshopStore := workshop.NewStore(pool)
portfolioStore := portfolio.NewStore(pool)
@@ -120,6 +121,7 @@ func main() {
vendorHandlers := handlers.NewVendorHandlers(vendorStore)
iaceHandler := handlers.NewIACEHandler(iaceStore)
trainingHandlers := handlers.NewTrainingHandlers(trainingStore, contentGenerator)
ragHandlers := handlers.NewRAGHandlers(corpusVersionStore)
// Initialize middleware
rbacMiddleware := rbac.NewMiddleware(rbacService, policyEngine)
@@ -345,6 +347,15 @@ func main() {
uccaRoutes.POST("/dsb-pool", escalationHandlers.AddDSBPoolMember)
}
// RAG routes - Legal Corpus Search & Versioning
ragRoutes := v1.Group("/rag")
{
ragRoutes.POST("/search", ragHandlers.Search)
ragRoutes.GET("/regulations", ragHandlers.ListRegulations)
ragRoutes.GET("/corpus-status", ragHandlers.CorpusStatus)
ragRoutes.GET("/corpus-versions/:collection", ragHandlers.CorpusVersionHistory)
}
// Roadmap routes - Compliance Implementation Roadmaps
roadmapRoutes := v1.Group("/roadmaps")
{

View File

@@ -9,13 +9,15 @@ import (
// RAGHandlers handles RAG search API endpoints.
type RAGHandlers struct {
ragClient *ucca.LegalRAGClient
ragClient *ucca.LegalRAGClient
corpusVersionStore *ucca.CorpusVersionStore
}
// NewRAGHandlers creates new RAG handlers.
func NewRAGHandlers() *RAGHandlers {
func NewRAGHandlers(corpusVersionStore *ucca.CorpusVersionStore) *RAGHandlers {
return &RAGHandlers{
ragClient: ucca.NewLegalRAGClient(),
ragClient: ucca.NewLegalRAGClient(),
corpusVersionStore: corpusVersionStore,
}
}
@@ -74,3 +76,62 @@ func (h *RAGHandlers) ListRegulations(c *gin.Context) {
"count": len(regs),
})
}
// CorpusStatus returns the current version status of all RAG collections.
// Responds 503 when no store is wired up, 500 on query failure, otherwise a
// map of collection name -> latest version summary.
// GET /sdk/v1/rag/corpus-status
func (h *RAGHandlers) CorpusStatus(c *gin.Context) {
	if h.corpusVersionStore == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "corpus version store not configured"})
		return
	}
	latest, err := h.corpusVersionStore.GetAllLatestVersions(c.Request.Context())
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch corpus versions: " + err.Error()})
		return
	}
	collections := map[string]gin.H{}
	for _, cv := range latest {
		entry := gin.H{
			"id":              cv.ID,
			"current_version": cv.Version,
			"documents_count": cv.DocumentsCount,
			"chunks_count":    cv.ChunksCount,
			"regulations":     cv.Regulations,
			"last_updated":    cv.CreatedAt,
			"digest":          cv.Digest,
		}
		collections[cv.CollectionName] = entry
	}
	c.JSON(http.StatusOK, gin.H{
		"collections": collections,
	})
}
// CorpusVersionHistory returns the version history for a specific collection,
// newest first. Responds 503 without a store, 400 on a missing collection
// name, 500 on query failure.
// GET /sdk/v1/rag/corpus-versions/:collection
func (h *RAGHandlers) CorpusVersionHistory(c *gin.Context) {
	if h.corpusVersionStore == nil {
		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "corpus version store not configured"})
		return
	}
	name := c.Param("collection")
	if name == "" {
		c.JSON(http.StatusBadRequest, gin.H{"error": "collection name required"})
		return
	}
	history, err := h.corpusVersionStore.ListCorpusVersions(c.Request.Context(), name)
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch corpus versions: " + err.Error()})
		return
	}
	c.JSON(http.StatusOK, gin.H{
		"collection": name,
		"versions":   history,
		"count":      len(history),
	})
}

View File

@@ -0,0 +1,158 @@
package ucca
import (
	"context"
	"errors"
	"time"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
)
// CorpusVersion tracks a specific version of the RAG compliance corpus.
// One row is written per ingestion run per collection; the newest row for a
// collection is its current version.
type CorpusVersion struct {
	ID             uuid.UUID `json:"id"`
	Version        string    `json:"version"`         // "2026-03-02.1"
	CollectionName string    `json:"collection_name"` // "bp_compliance_ce"
	DocumentsCount int       `json:"documents_count"`
	ChunksCount    int       `json:"chunks_count"`
	Regulations    []string  `json:"regulations"`      // ["eu_2016_679", ...]
	Digest         string    `json:"digest,omitempty"` // SHA256 over chunks
	IngestionSource string   `json:"ingestion_source,omitempty"` // e.g. the ingestion script that wrote the row
	Notes          string    `json:"notes,omitempty"`
	CreatedAt      time.Time `json:"created_at"`
	CreatedBy      string    `json:"created_by,omitempty"`
}
// CorpusVersionStore handles corpus version persistence.
type CorpusVersionStore struct {
	pool *pgxpool.Pool // shared pgx connection pool; not owned or closed by the store
}
// NewCorpusVersionStore creates a new corpus version store backed by the
// given connection pool.
func NewCorpusVersionStore(pool *pgxpool.Pool) *CorpusVersionStore {
	store := &CorpusVersionStore{pool: pool}
	return store
}
// CreateCorpusVersion inserts a new corpus version record. A fresh UUID and
// the current UTC time are filled in when the caller left them unset; the
// struct is mutated in place so the caller sees the generated values.
func (s *CorpusVersionStore) CreateCorpusVersion(ctx context.Context, v *CorpusVersion) error {
	if v.ID == uuid.Nil {
		v.ID = uuid.New()
	}
	if v.CreatedAt.IsZero() {
		v.CreatedAt = time.Now().UTC()
	}
	const insertSQL = `
	INSERT INTO compliance_corpus_versions (
		id, version, collection_name, documents_count, chunks_count,
		regulations, digest, ingestion_source, notes, created_at, created_by
	) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
	`
	_, err := s.pool.Exec(ctx, insertSQL,
		v.ID, v.Version, v.CollectionName, v.DocumentsCount, v.ChunksCount,
		v.Regulations, v.Digest, v.IngestionSource, v.Notes, v.CreatedAt, v.CreatedBy,
	)
	return err
}
// GetLatestCorpusVersion returns the most recent version for a collection.
// It returns (nil, nil) when the collection has no recorded versions.
func (s *CorpusVersionStore) GetLatestCorpusVersion(ctx context.Context, collection string) (*CorpusVersion, error) {
	var v CorpusVersion
	err := s.pool.QueryRow(ctx, `
	SELECT id, version, collection_name, documents_count, chunks_count,
	       regulations, digest, ingestion_source, notes, created_at, created_by
	FROM compliance_corpus_versions
	WHERE collection_name = $1
	ORDER BY created_at DESC
	LIMIT 1
	`, collection).Scan(
		&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
		&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
	)
	// errors.Is handles wrapped errors, which a plain == comparison misses.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}
	return &v, nil
}
// GetCorpusVersionByID retrieves a specific corpus version by ID.
// It returns (nil, nil) when no row exists for the given ID.
func (s *CorpusVersionStore) GetCorpusVersionByID(ctx context.Context, id uuid.UUID) (*CorpusVersion, error) {
	var v CorpusVersion
	err := s.pool.QueryRow(ctx, `
	SELECT id, version, collection_name, documents_count, chunks_count,
	       regulations, digest, ingestion_source, notes, created_at, created_by
	FROM compliance_corpus_versions
	WHERE id = $1
	`, id).Scan(
		&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
		&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
	)
	// errors.Is handles wrapped errors, which a plain == comparison misses.
	if errors.Is(err, pgx.ErrNoRows) {
		return nil, nil
	}
	if err != nil {
		return nil, err
	}
	return &v, nil
}
// ListCorpusVersions returns all versions for a collection, newest first.
// Returns a nil slice (not an error) when the collection has no versions.
func (s *CorpusVersionStore) ListCorpusVersions(ctx context.Context, collection string) ([]CorpusVersion, error) {
	rows, err := s.pool.Query(ctx, `
	SELECT id, version, collection_name, documents_count, chunks_count,
	       regulations, digest, ingestion_source, notes, created_at, created_by
	FROM compliance_corpus_versions
	WHERE collection_name = $1
	ORDER BY created_at DESC
	`, collection)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var versions []CorpusVersion
	for rows.Next() {
		var v CorpusVersion
		if err := rows.Scan(
			&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
			&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
		); err != nil {
			return nil, err
		}
		versions = append(versions, v)
	}
	// rows.Next() returns false on both end-of-set and mid-iteration failure;
	// rows.Err() distinguishes the two, so it must be checked here.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return versions, nil
}
// GetAllLatestVersions returns the latest version for every collection,
// using DISTINCT ON (collection_name) with created_at DESC ordering so the
// newest row per collection wins.
func (s *CorpusVersionStore) GetAllLatestVersions(ctx context.Context) ([]CorpusVersion, error) {
	rows, err := s.pool.Query(ctx, `
	SELECT DISTINCT ON (collection_name)
	       id, version, collection_name, documents_count, chunks_count,
	       regulations, digest, ingestion_source, notes, created_at, created_by
	FROM compliance_corpus_versions
	ORDER BY collection_name, created_at DESC
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var versions []CorpusVersion
	for rows.Next() {
		var v CorpusVersion
		if err := rows.Scan(
			&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
			&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
		); err != nil {
			return nil, err
		}
		versions = append(versions, v)
	}
	// Surface any iteration error that rows.Next() swallowed by returning false.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return versions, nil
}

View File

@@ -471,6 +471,10 @@ type Assessment struct {
Art22Risk bool `json:"art22_risk"`
TrainingAllowed TrainingAllowed `json:"training_allowed"`
// Corpus Versioning (RAG)
CorpusVersionID *uuid.UUID `json:"corpus_version_id,omitempty"`
CorpusVersion string `json:"corpus_version,omitempty"`
// LLM Explanation (optional)
ExplanationText *string `json:"explanation_text,omitempty"`
ExplanationGeneratedAt *time.Time `json:"explanation_generated_at,omitempty"`

View File

@@ -52,6 +52,7 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
triggered_rules, required_controls, recommended_architecture,
forbidden_patterns, example_matches,
dsfa_recommended, art22_risk, training_allowed,
corpus_version_id, corpus_version,
explanation_text, explanation_generated_at, explanation_model,
domain, created_at, updated_at, created_by
) VALUES (
@@ -61,8 +62,9 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
$14, $15, $16,
$17, $18,
$19, $20, $21,
$22, $23, $24,
$25, $26, $27, $28
$22, $23,
$24, $25, $26,
$27, $28, $29, $30
)
`,
a.ID, a.TenantID, a.NamespaceID, a.Title, a.PolicyVersion, a.Status,
@@ -71,6 +73,7 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
triggeredRules, requiredControls, recommendedArchitecture,
forbiddenPatterns, exampleMatches,
a.DSFARecommended, a.Art22Risk, string(a.TrainingAllowed),
a.CorpusVersionID, a.CorpusVersion,
a.ExplanationText, a.ExplanationGeneratedAt, a.ExplanationModel,
string(a.Domain), a.CreatedAt, a.UpdatedAt, a.CreatedBy,
)
@@ -92,6 +95,7 @@ func (s *Store) GetAssessment(ctx context.Context, id uuid.UUID) (*Assessment, e
triggered_rules, required_controls, recommended_architecture,
forbidden_patterns, example_matches,
dsfa_recommended, art22_risk, training_allowed,
corpus_version_id, corpus_version,
explanation_text, explanation_generated_at, explanation_model,
domain, created_at, updated_at, created_by
FROM ucca_assessments WHERE id = $1
@@ -102,6 +106,7 @@ func (s *Store) GetAssessment(ctx context.Context, id uuid.UUID) (*Assessment, e
&triggeredRules, &requiredControls, &recommendedArchitecture,
&forbiddenPatterns, &exampleMatches,
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
&a.CorpusVersionID, &a.CorpusVersion,
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
)
@@ -141,6 +146,7 @@ func (s *Store) ListAssessments(ctx context.Context, tenantID uuid.UUID, filters
triggered_rules, required_controls, recommended_architecture,
forbidden_patterns, example_matches,
dsfa_recommended, art22_risk, training_allowed,
corpus_version_id, corpus_version,
explanation_text, explanation_generated_at, explanation_model,
domain, created_at, updated_at, created_by
FROM ucca_assessments WHERE tenant_id = $1`
@@ -194,6 +200,7 @@ func (s *Store) ListAssessments(ctx context.Context, tenantID uuid.UUID, filters
&triggeredRules, &requiredControls, &recommendedArchitecture,
&forbiddenPatterns, &exampleMatches,
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
&a.CorpusVersionID, &a.CorpusVersion,
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
)

View File

@@ -0,0 +1,35 @@
-- =============================================================================
-- Migration 017: RAG Corpus Versioning
--
-- Tracks versions of the RAG corpus so assessments can record which
-- corpus version they were evaluated against. Enables staleness detection
-- and re-evaluation recommendations.
-- =============================================================================
CREATE TABLE IF NOT EXISTS compliance_corpus_versions (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
version VARCHAR(50) NOT NULL, -- "2026-03-02.1"
collection_name VARCHAR(100) NOT NULL, -- "bp_compliance_ce"
documents_count INT NOT NULL DEFAULT 0,
chunks_count INT NOT NULL DEFAULT 0,
regulations TEXT[], -- {"eu_2016_679", "eu_2024_1689"}
digest VARCHAR(128), -- SHA256 over all chunks
ingestion_source VARCHAR(200), -- "ingest-legal-corpus.sh"
notes TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_by VARCHAR(100)
);
CREATE INDEX IF NOT EXISTS idx_corpus_versions_collection
ON compliance_corpus_versions(collection_name);
-- Composite index supports "latest version per collection" queries
-- (ORDER BY created_at DESC / DISTINCT ON lookups in the Go store).
CREATE INDEX IF NOT EXISTS idx_corpus_versions_latest
ON compliance_corpus_versions(collection_name, created_at DESC);
-- Add corpus_version_id to ucca_assessments
-- (nullable: existing assessments predate corpus versioning).
ALTER TABLE ucca_assessments
ADD COLUMN IF NOT EXISTS corpus_version_id UUID REFERENCES compliance_corpus_versions(id),
ADD COLUMN IF NOT EXISTS corpus_version VARCHAR(50);
CREATE INDEX IF NOT EXISTS idx_ucca_assessments_corpus_version
ON ucca_assessments(corpus_version_id);

View File

@@ -0,0 +1,503 @@
"""
Source Policy Router — Manages allowed compliance data sources.
Controls which legal sources the RAG corpus may use,
operations matrix, PII rules, and provides audit trail.
Endpoints:
GET /api/v1/admin/sources — List all sources
POST /api/v1/admin/sources — Add new source
GET /api/v1/admin/sources/{id} — Get source by ID
PUT /api/v1/admin/sources/{id} — Update source
DELETE /api/v1/admin/sources/{id} — Remove source
GET /api/v1/admin/operations-matrix — Operations matrix
PUT /api/v1/admin/operations/{id} — Update operation
GET /api/v1/admin/pii-rules — List PII rules
POST /api/v1/admin/pii-rules — Create PII rule
PUT /api/v1/admin/pii-rules/{id} — Update PII rule
DELETE /api/v1/admin/pii-rules/{id} — Delete PII rule
GET /api/v1/admin/policy-audit — Audit trail
GET /api/v1/admin/policy-stats — Dashboard statistics
GET /api/v1/admin/compliance-report — Compliance report
"""
import uuid
from datetime import datetime
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Depends, Query
from pydantic import BaseModel, Field
from sqlalchemy.orm import Session
from database import get_db
from compliance.db.source_policy_models import (
AllowedSourceDB,
SourceOperationDB,
PIIRuleDB,
SourcePolicyAuditDB,
)
router = APIRouter(prefix="/v1/admin", tags=["source-policy"])
# =============================================================================
# Pydantic Schemas
# =============================================================================
class SourceCreate(BaseModel):
    """Request body for adding a new allowed source."""
    domain: str
    name: str
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    # Relevance boost applied to hits from this source; clamped to [0, 1].
    trust_boost: float = Field(default=0.5, ge=0.0, le=1.0)
    source_type: str = "legal"
    active: bool = True
    metadata: Optional[dict] = None


class SourceUpdate(BaseModel):
    """Partial update for a source; only set fields are applied."""
    domain: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    source_type: Optional[str] = None
    active: Optional[bool] = None
    metadata: Optional[dict] = None


class SourceResponse(BaseModel):
    """Full serialized representation of an allowed source."""
    id: str
    domain: str
    name: str
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: float
    source_type: str
    active: bool
    metadata: Optional[dict] = None
    created_at: str
    updated_at: Optional[str] = None

    class Config:
        # Allow construction directly from SQLAlchemy ORM objects.
        from_attributes = True


class OperationUpdate(BaseModel):
    """Update for one cell of the source operations matrix."""
    allowed: bool
    conditions: Optional[str] = None


class PIIRuleCreate(BaseModel):
    """Request body for creating a PII detection/masking rule."""
    name: str
    description: Optional[str] = None
    pattern: Optional[str] = None
    category: str
    # Default action is masking; other values are presumably e.g. "block" — confirm with frontend.
    action: str = "mask"
    active: bool = True


class PIIRuleUpdate(BaseModel):
    """Partial update for a PII rule; only set fields are applied."""
    name: Optional[str] = None
    description: Optional[str] = None
    pattern: Optional[str] = None
    category: Optional[str] = None
    action: Optional[str] = None
    active: Optional[bool] = None
# =============================================================================
# Helper: Audit logging
# =============================================================================
def _log_audit(db: Session, action: str, entity_type: str, entity_id, old_values=None, new_values=None):
    """Queue an audit-trail entry; it is persisted when the caller commits."""
    fields = {
        "action": action,
        "entity_type": entity_type,
        "entity_id": entity_id,
        "old_values": old_values,
        "new_values": new_values,
        # NOTE(review): no auth context is wired in yet, so every entry is
        # attributed to "system" — confirm before relying on user attribution.
        "user_id": "system",
    }
    db.add(SourcePolicyAuditDB(**fields))
def _source_to_dict(source: AllowedSourceDB) -> dict:
    """Snapshot of a source row, used for old/new values in audit entries."""
    snapshot = {
        key: getattr(source, key)
        for key in (
            "domain", "name", "description", "license",
            "legal_basis", "trust_boost", "source_type", "active",
        )
    }
    snapshot["id"] = str(source.id)
    return snapshot
# =============================================================================
# Sources CRUD
# =============================================================================
@router.get("/sources")
async def list_sources(
    active_only: bool = Query(False),
    db: Session = Depends(get_db),
):
    """List all allowed sources, ordered by name; optionally only active ones."""
    query = db.query(AllowedSourceDB)
    if active_only:
        query = query.filter(AllowedSourceDB.active == True)
    rows = query.order_by(AllowedSourceDB.name).all()

    def serialize(s):
        # Full representation including DB timestamps (None-safe).
        return {
            "id": str(s.id),
            "domain": s.domain,
            "name": s.name,
            "description": s.description,
            "license": s.license,
            "legal_basis": s.legal_basis,
            "trust_boost": s.trust_boost,
            "source_type": s.source_type,
            "active": s.active,
            "metadata": s.metadata_,
            "created_at": s.created_at.isoformat() if s.created_at else None,
            "updated_at": s.updated_at.isoformat() if s.updated_at else None,
        }

    payload = [serialize(s) for s in rows]
    return {"sources": payload, "count": len(payload)}
@router.post("/sources")
async def create_source(
    data: SourceCreate,
    db: Session = Depends(get_db),
):
    """Add a new allowed source.

    Rejects duplicate domains with 409. The new row is flushed before the
    audit entry is written so the audit record carries the DB-generated id.
    """
    existing = db.query(AllowedSourceDB).filter(AllowedSourceDB.domain == data.domain).first()
    if existing:
        raise HTTPException(status_code=409, detail=f"Source with domain '{data.domain}' already exists")
    source = AllowedSourceDB(
        domain=data.domain,
        name=data.name,
        description=data.description,
        license=data.license,
        legal_basis=data.legal_basis,
        trust_boost=data.trust_boost,
        source_type=data.source_type,
        active=data.active,
        metadata_=data.metadata,  # "metadata" is reserved on the ORM side
    )
    db.add(source)
    # BUGFIX: flush so the DB assigns source.id before the audit snapshot;
    # previously the audit row was written with entity_id = None.
    db.flush()
    _log_audit(db, "create", "source", source.id, new_values=_source_to_dict(source))
    db.commit()
    db.refresh(source)
    return {
        "id": str(source.id),
        "domain": source.domain,
        "name": source.name,
        "created_at": source.created_at.isoformat(),
    }
@router.get("/sources/{source_id}")
async def get_source(source_id: str, db: Session = Depends(get_db)):
    """Return the full record for a single source, or 404 if unknown."""
    source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
    if source is None:
        raise HTTPException(status_code=404, detail="Source not found")
    created = source.created_at
    updated = source.updated_at
    return {
        "id": str(source.id),
        "domain": source.domain,
        "name": source.name,
        "description": source.description,
        "license": source.license,
        "legal_basis": source.legal_basis,
        "trust_boost": source.trust_boost,
        "source_type": source.source_type,
        "active": source.active,
        "metadata": source.metadata_,
        "created_at": created.isoformat() if created else None,
        "updated_at": updated.isoformat() if updated else None,
    }
@router.put("/sources/{source_id}")
async def update_source(
    source_id: str,
    data: SourceUpdate,
    db: Session = Depends(get_db),
):
    """Apply a partial update to a source, recording before/after in the audit trail."""
    source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
    if source is None:
        raise HTTPException(status_code=404, detail="Source not found")
    before = _source_to_dict(source)
    changes = data.model_dump(exclude_unset=True)
    # The API field "metadata" maps to the ORM column "metadata_".
    if "metadata" in changes:
        changes["metadata_"] = changes.pop("metadata")
    for field, value in changes.items():
        setattr(source, field, value)
    _log_audit(db, "update", "source", source.id, old_values=before, new_values=changes)
    db.commit()
    db.refresh(source)
    return {"status": "updated", "id": str(source.id)}
@router.delete("/sources/{source_id}")
async def delete_source(source_id: str, db: Session = Depends(get_db)):
    """Remove a source together with its operations-matrix rows."""
    source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
    if source is None:
        raise HTTPException(status_code=404, detail="Source not found")
    snapshot = _source_to_dict(source)
    _log_audit(db, "delete", "source", source.id, old_values=snapshot)
    # Manual cascade: remove dependent operations before the source row.
    db.query(SourceOperationDB).filter(SourceOperationDB.source_id == source_id).delete()
    db.delete(source)
    db.commit()
    return {"status": "deleted", "id": source_id}
# =============================================================================
# Operations Matrix
# =============================================================================
@router.get("/operations-matrix")
async def get_operations_matrix(db: Session = Depends(get_db)):
    """Return every row of the source operations matrix."""
    rows = db.query(SourceOperationDB).all()
    operations = [
        {
            "id": str(row.id),
            "source_id": str(row.source_id),
            "operation": row.operation,
            "allowed": row.allowed,
            "conditions": row.conditions,
        }
        for row in rows
    ]
    return {"operations": operations, "count": len(operations)}
@router.put("/operations/{operation_id}")
async def update_operation(
    operation_id: str,
    data: OperationUpdate,
    db: Session = Depends(get_db),
):
    """Update one cell of the operations matrix (allowed flag and optional conditions)."""
    operation = db.query(SourceOperationDB).filter(SourceOperationDB.id == operation_id).first()
    if operation is None:
        raise HTTPException(status_code=404, detail="Operation not found")
    operation.allowed = data.allowed
    # Conditions are only replaced when the client actually sent a value.
    if data.conditions is not None:
        operation.conditions = data.conditions
    _log_audit(db, "update", "operation", operation.id, new_values={"allowed": data.allowed})
    db.commit()
    return {"status": "updated", "id": str(operation.id)}
# =============================================================================
# PII Rules
# =============================================================================
@router.get("/pii-rules")
async def list_pii_rules(db: Session = Depends(get_db)):
    """List all PII rules, grouped by category then name."""
    rows = db.query(PIIRuleDB).order_by(PIIRuleDB.category, PIIRuleDB.name).all()

    def serialize(rule):
        return {
            "id": str(rule.id),
            "name": rule.name,
            "description": rule.description,
            "pattern": rule.pattern,
            "category": rule.category,
            "action": rule.action,
            "active": rule.active,
            "created_at": rule.created_at.isoformat() if rule.created_at else None,
        }

    rules = [serialize(rule) for rule in rows]
    return {"rules": rules, "count": len(rules)}
@router.post("/pii-rules")
async def create_pii_rule(data: PIIRuleCreate, db: Session = Depends(get_db)):
    """Create a new PII rule.

    The row is flushed before audit logging so the audit entry records the
    DB-generated id.
    """
    rule = PIIRuleDB(
        name=data.name,
        description=data.description,
        pattern=data.pattern,
        category=data.category,
        action=data.action,
        active=data.active,
    )
    db.add(rule)
    # BUGFIX: flush so rule.id is assigned before the audit snapshot;
    # previously the audit row was written with entity_id = None.
    db.flush()
    _log_audit(db, "create", "pii_rule", rule.id, new_values={"name": data.name, "category": data.category})
    db.commit()
    db.refresh(rule)
    return {"id": str(rule.id), "name": rule.name}
@router.put("/pii-rules/{rule_id}")
async def update_pii_rule(rule_id: str, data: PIIRuleUpdate, db: Session = Depends(get_db)):
    """Partially update a PII rule; only fields the client sent are changed."""
    rule = db.query(PIIRuleDB).filter(PIIRuleDB.id == rule_id).first()
    if rule is None:
        raise HTTPException(status_code=404, detail="PII rule not found")
    changes = data.model_dump(exclude_unset=True)
    for field, value in changes.items():
        setattr(rule, field, value)
    _log_audit(db, "update", "pii_rule", rule.id, new_values=changes)
    db.commit()
    return {"status": "updated", "id": str(rule.id)}
@router.delete("/pii-rules/{rule_id}")
async def delete_pii_rule(rule_id: str, db: Session = Depends(get_db)):
    """Delete a PII rule, recording name and category in the audit trail."""
    rule = db.query(PIIRuleDB).filter(PIIRuleDB.id == rule_id).first()
    if rule is None:
        raise HTTPException(status_code=404, detail="PII rule not found")
    snapshot = {"name": rule.name, "category": rule.category}
    _log_audit(db, "delete", "pii_rule", rule.id, old_values=snapshot)
    db.delete(rule)
    db.commit()
    return {"status": "deleted", "id": rule_id}
# =============================================================================
# Audit Trail
# =============================================================================
@router.get("/policy-audit")
async def get_policy_audit(
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    entity_type: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Paginated audit trail of source-policy changes, newest first."""
    query = db.query(SourcePolicyAuditDB)
    if entity_type:
        query = query.filter(SourcePolicyAuditDB.entity_type == entity_type)
    total = query.count()
    page = (
        query.order_by(SourcePolicyAuditDB.created_at.desc())
        .offset(offset)
        .limit(limit)
        .all()
    )
    entries = []
    for entry in page:
        entries.append({
            "id": str(entry.id),
            "action": entry.action,
            "entity_type": entry.entity_type,
            "entity_id": str(entry.entity_id) if entry.entity_id else None,
            "old_values": entry.old_values,
            "new_values": entry.new_values,
            "user_id": entry.user_id,
            "created_at": entry.created_at.isoformat() if entry.created_at else None,
        })
    return {"entries": entries, "total": total, "limit": limit, "offset": offset}
# =============================================================================
# Dashboard Statistics
# =============================================================================
@router.get("/policy-stats")
async def get_policy_stats(db: Session = Depends(get_db)):
    """Dashboard statistics for the source-policy overview.

    NOTE(review): "blocked" counters are approximated by counting delete
    actions in the audit trail — confirm this matches what the dashboard
    intends to display.
    """
    total_sources = db.query(AllowedSourceDB).count()
    active_sources = db.query(AllowedSourceDB).filter(AllowedSourceDB.active == True).count()
    pii_rules = db.query(PIIRuleDB).filter(PIIRuleDB.active == True).count()
    # Start of the current UTC day, for the "today" counter.
    midnight = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    deletes = db.query(SourcePolicyAuditDB).filter(SourcePolicyAuditDB.action == "delete")
    blocked_today = deletes.filter(SourcePolicyAuditDB.created_at >= midnight).count()
    blocked_total = deletes.count()
    return {
        "active_policies": active_sources,
        "allowed_sources": total_sources,
        "pii_rules": pii_rules,
        "blocked_today": blocked_today,
        "blocked_total": blocked_total,
    }
@router.get("/compliance-report")
async def get_compliance_report(db: Session = Depends(get_db)):
"""Generate a compliance report for source policies."""
sources = db.query(AllowedSourceDB).filter(AllowedSourceDB.active == True).all()
pii_rules = db.query(PIIRuleDB).filter(PIIRuleDB.active == True).all()
return {
"report_date": datetime.utcnow().isoformat(),
"summary": {
"active_sources": len(sources),
"active_pii_rules": len(pii_rules),
"source_types": list(set(s.source_type for s in sources)),
"licenses": list(set(s.license for s in sources if s.license)),
},
"sources": [
{
"domain": s.domain,
"name": s.name,
"license": s.license,
"legal_basis": s.legal_basis,
"trust_boost": s.trust_boost,
}
for s in sources
],
"pii_rules": [
{
"name": r.name,
"category": r.category,
"action": r.action,
}
for r in pii_rules
],
}

View File

@@ -0,0 +1,105 @@
"""
SQLAlchemy models for Source Policy Management.
Tables:
- compliance_allowed_sources: Whitelisted data sources for RAG corpus
- compliance_source_operations: Operations matrix for source data flows
- compliance_pii_rules: PII detection/masking rules for sources
- compliance_source_policy_audit: Audit trail for source policy changes
"""
import uuid
from datetime import datetime
from sqlalchemy import (
Column, String, Text, Boolean, DateTime, Float, JSON, Index
)
from sqlalchemy.dialects.postgresql import UUID
from classroom_engine.database import Base
class AllowedSourceDB(Base):
    """Whitelisted data source for the compliance RAG corpus.

    One row per domain (unique); the policy endpoints treat only rows
    with active=True as currently allowed.
    """
    __tablename__ = 'compliance_allowed_sources'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Unique source domain, e.g. "gesetze-im-internet.de".
    domain = Column(String(255), unique=True, nullable=False)
    name = Column(String(255), nullable=False)        # human-readable source name
    description = Column(Text, nullable=True)
    license = Column(String(100), nullable=True)      # DL-DE-BY-2.0, CC-BY, etc.
    legal_basis = Column(String(200), nullable=True)  # §5 UrhG, etc.
    # Ranking boost for documents from this source — assumed range 0..1;
    # TODO(review): confirm against the retrieval code.
    trust_boost = Column(Float, default=0.5)
    source_type = Column(String(50), default='legal')  # legal, guidance, template
    active = Column(Boolean, default=True)
    # SQL column is named "metadata"; the attribute is metadata_ because
    # SQLAlchemy's declarative Base reserves the .metadata attribute.
    metadata_ = Column('metadata', JSON, nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)  # naive UTC
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_allowed_sources_domain', 'domain'),
        Index('idx_allowed_sources_active', 'active'),
    )

    def __repr__(self):
        return f"<AllowedSource {self.domain}: {self.name}>"
class SourceOperationDB(Base):
    """Operations-matrix entry: whether a given operation is permitted
    for a source, optionally with free-text conditions."""
    __tablename__ = 'compliance_source_operations'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # Logical FK to compliance_allowed_sources.id. NOTE(review): the SQL
    # migration declares REFERENCES ... ON DELETE CASCADE, but no ForeignKey
    # is declared here — consider keeping model and migration in sync.
    source_id = Column(UUID(as_uuid=True), nullable=False)
    operation = Column(String(50), nullable=False)  # ingest, search, export, share
    allowed = Column(Boolean, default=True)
    conditions = Column(Text, nullable=True)  # Conditions for this operation
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_source_operations_source', 'source_id'),
    )
class PIIRuleDB(Base):
    """PII detection and masking rule applied to compliance sources.

    A rule pairs an optional regex pattern with a category and an action
    describing what to do on a match.
    """
    __tablename__ = 'compliance_pii_rules'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    # Regex pattern; nullable, so a rule may exist without a pattern.
    pattern = Column(Text, nullable=True)
    category = Column(String(50), nullable=False)  # email, phone, name, address, etc.
    action = Column(String(20), default='mask')    # mask, redact, flag
    active = Column(Boolean, default=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_pii_rules_category', 'category'),
        Index('idx_pii_rules_active', 'active'),
    )
class SourcePolicyAuditDB(Base):
    """Append-only audit trail row for source policy changes.

    Written by the router's _log_audit helper on create/update/delete of
    sources, operations, and PII rules; old/new values are stored as JSON
    snapshots.
    """
    __tablename__ = 'compliance_source_policy_audit'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    action = Column(String(20), nullable=False)       # create, update, delete
    entity_type = Column(String(50), nullable=False)  # source, operation, pii_rule
    entity_id = Column(UUID(as_uuid=True), nullable=True)
    old_values = Column(JSON, nullable=True)
    new_values = Column(JSON, nullable=True)
    user_id = Column(String(100), nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)  # naive UTC

    __table_args__ = (
        Index('idx_source_audit_entity', 'entity_type', 'entity_id'),
        Index('idx_source_audit_created', 'created_at'),
    )

View File

@@ -21,6 +21,9 @@ from dsr_admin_api import router as dsr_admin_router, templates_router as dsr_te
# Compliance framework sub-package
from compliance.api import router as compliance_framework_router
# Source Policy
from compliance.api.source_policy_router import router as source_policy_router
# Middleware
from middleware import (
RequestIDMiddleware,
@@ -85,6 +88,9 @@ app.include_router(dsr_templates_router, prefix="/api")
# Compliance Framework (regulations, controls, evidence, risks, audits, ISMS)
app.include_router(compliance_framework_router, prefix="/api")
# Source Policy (allowed sources, PII rules, audit)
app.include_router(source_policy_router, prefix="/api")
if __name__ == "__main__":
import uvicorn

View File

@@ -0,0 +1,73 @@
-- =============================================================================
-- Migration 001: Source Policy Tables
--
-- Tables for managing allowed compliance data sources, operations matrix,
-- PII rules, and audit trail.
--
-- The migration is intended to be idempotent: CREATE ... IF NOT EXISTS
-- everywhere, and the seed rows below are guarded so re-running does not
-- create duplicates.
-- =============================================================================
CREATE TABLE IF NOT EXISTS compliance_allowed_sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    domain VARCHAR(255) UNIQUE NOT NULL,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    license VARCHAR(100),
    legal_basis VARCHAR(200),
    trust_boost FLOAT DEFAULT 0.5,
    source_type VARCHAR(50) DEFAULT 'legal',
    active BOOLEAN DEFAULT true,
    metadata JSON,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);
CREATE INDEX IF NOT EXISTS idx_allowed_sources_domain ON compliance_allowed_sources(domain);
CREATE INDEX IF NOT EXISTS idx_allowed_sources_active ON compliance_allowed_sources(active);

CREATE TABLE IF NOT EXISTS compliance_source_operations (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id UUID NOT NULL REFERENCES compliance_allowed_sources(id) ON DELETE CASCADE,
    operation VARCHAR(50) NOT NULL,
    allowed BOOLEAN DEFAULT true,
    conditions TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);
CREATE INDEX IF NOT EXISTS idx_source_operations_source ON compliance_source_operations(source_id);

CREATE TABLE IF NOT EXISTS compliance_pii_rules (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description TEXT,
    pattern TEXT,
    category VARCHAR(50) NOT NULL,
    action VARCHAR(20) DEFAULT 'mask',
    active BOOLEAN DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);
CREATE INDEX IF NOT EXISTS idx_pii_rules_category ON compliance_pii_rules(category);
CREATE INDEX IF NOT EXISTS idx_pii_rules_active ON compliance_pii_rules(active);

CREATE TABLE IF NOT EXISTS compliance_source_policy_audit (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    action VARCHAR(20) NOT NULL,
    entity_type VARCHAR(50) NOT NULL,
    entity_id UUID,
    old_values JSON,
    new_values JSON,
    user_id VARCHAR(100),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_source_audit_entity ON compliance_source_policy_audit(entity_type, entity_id);
CREATE INDEX IF NOT EXISTS idx_source_audit_created ON compliance_source_policy_audit(created_at);

-- Seed default PII rules.
-- NOTE: compliance_pii_rules has no unique constraint, so a bare
-- "ON CONFLICT DO NOTHING" can never match and every re-run of this
-- migration would duplicate the seed rows. Guard each row by name instead.
INSERT INTO compliance_pii_rules (name, category, pattern, action, description)
SELECT v.name, v.category, v.pattern, v.action, v.description
FROM (VALUES
    ('E-Mail-Adresse', 'email', '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', 'mask', 'E-Mail-Adressen erkennen und maskieren'),
    ('Telefonnummer', 'phone', '(\+49|0)[0-9\s/-]{8,15}', 'mask', 'Deutsche Telefonnummern erkennen'),
    ('IBAN', 'financial', 'DE[0-9]{2}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{2}', 'redact', 'Deutsche IBAN-Nummern erkennen und entfernen'),
    ('Postadresse', 'address', '[0-9]{5}\s+[A-Z][a-z]', 'flag', 'Postleitzahlen mit Ortsnamen markieren')
) AS v(name, category, pattern, action, description)
WHERE NOT EXISTS (
    SELECT 1 FROM compliance_pii_rules p WHERE p.name = v.name
);

971
scripts/ingest-legal-corpus.sh Executable file
View File

@@ -0,0 +1,971 @@
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — RAG Legal Corpus Ingestion
#
# Laedt 23 freie Rechtsquellen herunter und ingestiert sie in Qdrant
# via die Core RAG-API (Port 8097).
#
# Ausfuehrung auf dem Mac Mini:
# ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
#
# Phasen: download, gesetze, eu, templates, datenschutz, verify
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# WORK_DIR / SDK_URL / DB_URL can be overridden from the environment.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
QDRANT_URL="http://localhost:6333"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"
# NOTE(review): -k disables TLS verification (presumably for self-signed
# local certs — confirm). CURL_OPTS is expanded unquoted throughout the
# script so it word-splits into individual options.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"

# Counters (incremented by upload_file)
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --skip-download) SKIP_DOWNLOAD=true; shift ;;
        --only) ONLY_PHASE="$2"; shift 2 ;;
        -h|--help)
            echo "Usage: $0 [--skip-download] [--only PHASE]"
            echo "Phases: download, gesetze, eu, templates, datenschutz, verify, version"
            exit 0
            ;;
        *) echo "Unknown option: $1"; exit 1 ;;
    esac
done
# --- Helpers -----------------------------------------------------------------
# Timestamped logging helpers: plain, success (✓), warning (⚠, stderr),
# and failure (✗, stderr).
log()  { printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"; }
ok()   { printf '[%s] ✓ %s\n' "$(date '+%H:%M:%S')" "$*"; }
warn() { printf '[%s] ⚠ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
fail() { printf '[%s] ✗ %s\n' "$(date '+%H:%M:%S')" "$*" >&2; }
# Upload one file to the RAG API.
# Args: 1=file 2=collection 3=data_type 4=use_case 5=year 6=metadata_json
#       7=label (optional; defaults to the file's basename)
# Increments UPLOADED / FAILED / SKIPPED; returns 1 on any failure.
upload_file() {
    local file="$1"
    local collection="$2"
    local data_type="$3"
    local use_case="$4"
    local year="$5"
    local metadata_json="$6"
    local label="${7:-$(basename "$file")}"
    if [[ ! -f "$file" ]]; then
        warn "File not found: $file"
        FAILED=$((FAILED + 1))
        return 1
    fi
    local filesize
    # stat -f%z is BSD/macOS, stat -c%s is GNU — try both, fall back to 0.
    filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
    if [[ "$filesize" -lt 100 ]]; then
        # Tiny files are treated as failed downloads (e.g. error pages).
        warn "File too small (${filesize}B), skipping: $label"
        SKIPPED=$((SKIPPED + 1))
        return 1
    fi
    log "Uploading: $label$collection ($(( filesize / 1024 ))KB)"
    local response
    # CURL_OPTS intentionally unquoted so it word-splits into options.
    # "|| true" keeps set -e from aborting; failure is detected below.
    response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
        -F "file=@${file}" \
        -F "collection=${collection}" \
        -F "data_type=${data_type}" \
        -F "use_case=${use_case}" \
        -F "year=${year}" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=512" \
        -F "chunk_overlap=50" \
        -F "metadata_json=${metadata_json}" \
        2>/dev/null) || true
    # Success is signalled by either "chunks_count" or "vectors_indexed"
    # in the JSON response (two API response shapes are handled).
    if echo "$response" | grep -q '"chunks_count"'; then
        local chunks
        chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
        ok "$label$chunks chunks"
        UPLOADED=$((UPLOADED + 1))
    elif echo "$response" | grep -q '"vectors_indexed"'; then
        local vectors
        vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
        ok "$label$vectors vectors"
        UPLOADED=$((UPLOADED + 1))
    else
        fail "Upload failed: $label"
        fail "Response: $response"
        FAILED=$((FAILED + 1))
        return 1
    fi
}
# Shallow-clone $1 into directory $2; no-op if the target already exists.
clone_repo() {
    local url="$1" target="$2"
    if [[ -d "$target" ]]; then
        log "Repo exists: $target (skipping clone)"
        return 0
    fi
    log "Cloning: $url"
    if ! git clone --depth 1 "$url" "$target" 2>/dev/null; then
        warn "Clone failed: $url"
        return 1
    fi
}
# Download $1 to path $2; no-op if the target file already exists.
download_pdf() {
    local url="$1" target="$2"
    if [[ -f "$target" ]]; then
        log "PDF exists: $(basename "$target") (skipping)"
        return 0
    fi
    log "Downloading: $(basename "$target")"
    # CURL_OPTS intentionally unquoted so it word-splits into options.
    if ! curl $CURL_OPTS -L "$url" -o "$target" 2>/dev/null; then
        warn "Download failed: $url"
        return 1
    fi
}
# Extract plain text from a gesetze-im-internet.de HTML page.
# Args: 1=url 2=output_file 3=label. Skips extraction if the output exists.
# The inline Python uses html.parser to strip script/style/nav/header/footer
# and inserts newlines at block-level tags; the site serves ISO-8859-1,
# hence the codecs reader on stdin.
extract_gesetz_html() {
    local url="$1"
    local output="$2"
    local label="$3"
    if [[ -f "$output" ]]; then
        log "Text exists: $(basename "$output") (skipping)"
        return 0
    fi
    log "Extracting: $label from gesetze-im-internet.de"
    curl $CURL_OPTS -L "$url" 2>/dev/null \
        | python3 -c "
import sys, codecs
# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)
from html.parser import HTMLParser
class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        self.in_content = False
        self.skip = False
    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == 'div' and 'jnhtml' in attrs_dict.get('class', ''):
            self.in_content = True
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = True
    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = False
        if tag in ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li'):
            self.text.append('\n')
    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)
parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$output" || {
        warn "Extraction failed: $label"
        return 1
    }
}
# Concatenate Markdown files from the bundestag/gesetze repo for one law
# into a single text file with "---" separators between paragraphs.
# Args: 1=gesetz_dir 2=output_file 3=label. Missing dir is non-fatal.
concat_bundestag_gesetz() {
    local gesetz_dir="$1"
    local output="$2"
    local label="$3"
    if [[ ! -d "$gesetz_dir" ]]; then
        warn "Gesetz directory not found: $gesetz_dir"
        return 0
    fi
    log "Concatenating: $label"
    {
        echo "# $label"
        echo ""
        # Lexicographic sort of per-paragraph file names.
        # NOTE(review): plain `sort` orders "10" before "2" — confirm the
        # repo's file naming keeps paragraphs in the intended order.
        find "$gesetz_dir" -name "*.md" -type f | sort | while read -r f; do
            cat "$f"
            echo ""
            echo "---"
            echo ""
        done
    } > "$output"
}
# Print the points_count of a Qdrant collection, or "?" when unreachable
# or the response cannot be parsed.
collection_count() {
    local collection_name="$1"
    local endpoint="${QDRANT_URL}/collections/${collection_name}"
    curl -s "$endpoint" 2>/dev/null \
        | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null || echo "?"
}
# =============================================================================
# PHASE A: Downloads
# =============================================================================
# PHASE A: fetch all raw material — EUR-Lex PDFs, individual German law
# paragraphs, template git repos, and EDPB/EDPS guidance PDFs — into
# $WORK_DIR/{pdfs,repos,texts}. Each helper is idempotent (skips existing
# files/repos), so the phase can be re-run safely.
phase_download() {
    log "=========================================="
    log "PHASE A: Downloads (PDFs + Git-Repos)"
    log "=========================================="
    mkdir -p "$WORK_DIR"/{pdfs,repos,texts}
    # --- A1: EUR-Lex PDFs ---
    log "--- EUR-Lex PDFs ---"
    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \
        "$WORK_DIR/pdfs/dsa_2022_2065.pdf"
    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \
        "$WORK_DIR/pdfs/eprivacy_2002_58.pdf"
    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \
        "$WORK_DIR/pdfs/scc_2021_914.pdf"
    # --- A2: German laws (individual paragraphs) ---
    log "--- Deutsche Gesetze (Einzelparagraphen) ---"
    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/ddg/__5.html" \
        "$WORK_DIR/texts/ddg_5.txt" \
        "DDG § 5 (Impressum)"
    # TDDDG is still published under its old name "ttdsg" on gesetze-im-internet.de
    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/ttdsg/__25.html" \
        "$WORK_DIR/texts/tdddg_25.txt" \
        "TDDDG § 25 (Cookies)"
    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/urhg/__5.html" \
        "$WORK_DIR/texts/urhg_5.txt" \
        "UrhG § 5 (Amtliche Werke)"
    # EGBGB Art. 246a § 1 (contains the reference to the model withdrawal notice)
    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html" \
        "$WORK_DIR/texts/egbgb_widerruf.txt" \
        "EGBGB Muster-Widerrufsbelehrung"
    # --- A3: Git repos ---
    log "--- Git-Repos ---"
    clone_repo "https://github.com/bundestag/gesetze.git" \
        "$WORK_DIR/repos/bundestag-gesetze"
    clone_repo "https://github.com/github/site-policy.git" \
        "$WORK_DIR/repos/github-site-policy"
    clone_repo "https://github.com/opengovfoundation/site-policy.git" \
        "$WORK_DIR/repos/opengov-site-policy"
    clone_repo "https://github.com/creativecommons/cc-legal-tools-data.git" \
        "$WORK_DIR/repos/cc-legal-tools"
    clone_repo "https://github.com/oprvc/oprvc.github.io.git" \
        "$WORK_DIR/repos/oprvc"
    clone_repo "https://github.com/webflorist/privacy-policy-text.git" \
        "$WORK_DIR/repos/webflorist"
    # "|| true" marks optional repos: set -e must not abort the whole run.
    clone_repo "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git" \
        "$WORK_DIR/repos/tempest-privacy" || true
    clone_repo "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git" \
        "$WORK_DIR/repos/tempest-tos" || true
    clone_repo "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git" \
        "$WORK_DIR/repos/tempest-cookie" || true
    clone_repo "https://github.com/orestbida/cookieconsent.git" \
        "$WORK_DIR/repos/cookieconsent" || true
    # CommonPaper keeps a separate repo per contract type
    clone_repo "https://github.com/CommonPaper/CSA.git" \
        "$WORK_DIR/repos/common-paper-csa" || true
    clone_repo "https://github.com/CommonPaper/SLA.git" \
        "$WORK_DIR/repos/common-paper-sla" || true
    clone_repo "https://github.com/CommonPaper/PSA.git" \
        "$WORK_DIR/repos/common-paper-psa" || true
    # OpenCode.de (Datennutzungsklauseln) - try HTTPS
    clone_repo "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git" \
        "$WORK_DIR/repos/datennutzungsklauseln" || true
    # --- A4: EDPB/EDPS PDFs (verified URLs) ---
    log "--- EDPB/EDPS Guidance PDFs ---"
    # EDPB Guidelines 05/2020 on Consent
    download_pdf \
        "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf" \
        "$WORK_DIR/pdfs/edpb_consent_guidelines.pdf"
    # EDPB Guidelines 4/2019 Data Protection by Design and Default
    download_pdf \
        "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf" \
        "$WORK_DIR/pdfs/edpb_privacy_by_design.pdf"
    # EDPB Guidelines 03/2022 Dark Patterns
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf" \
        "$WORK_DIR/pdfs/edpb_dark_patterns.pdf"
    # EDPB Guidelines 8/2020 Social Media Targeting
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf" \
        "$WORK_DIR/pdfs/edpb_social_media_targeting.pdf"
    # EDPB Cookie Banner Taskforce Report (Jan 2023)
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf" \
        "$WORK_DIR/pdfs/edpb_cookie_banner_taskforce.pdf"
    # EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf" \
        "$WORK_DIR/pdfs/edpb_eprivacy_art53.pdf"
    # EDPB Guidelines 1/2024 Legitimate Interest
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf" \
        "$WORK_DIR/pdfs/edpb_legitimate_interest.pdf"
    # EDPB DPO Coordinated Enforcement Report 2024
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf" \
        "$WORK_DIR/pdfs/edpb_dpo_report.pdf"
    # EDPS GenAI Orientations (June 2024)
    download_pdf \
        "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf" \
        "$WORK_DIR/pdfs/edps_generative_ai.pdf"
    # EDPS Digital Ethics Report (2018)
    download_pdf \
        "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf" \
        "$WORK_DIR/pdfs/edps_digital_ethics.pdf"
    # --- A5: Text extraction from repos ---
    log "--- Text-Extraktion aus Repos ---"
    # bundestag/gesetze: available laws (the repo is partially outdated) —
    # DDG, TDDDG, EGBGB are missing; only BGB, UrhG, TMG are present.
    local -a bundestag_gesetze=(
        "b/bgb:BGB"
        "u/urhg:UrhG"
        "t/tmg:TMG"
    )
    for entry in "${bundestag_gesetze[@]}"; do
        # Each entry is "repo/subpath:Label".
        local path="${entry%%:*}"
        local label="${entry##*:}"
        local gesetz_dir="$WORK_DIR/repos/bundestag-gesetze/$path"
        if [[ -d "$gesetz_dir" ]]; then
            local name
            name=$(echo "$label" | tr '[:upper:]' '[:lower:]')
            concat_bundestag_gesetz "$gesetz_dir" \
                "$WORK_DIR/texts/bundestag_${name}_komplett.txt" \
                "$label (komplett)"
        else
            warn "Bundestag Gesetz nicht gefunden: $gesetz_dir"
        fi
    done
    log "Download phase complete."
}
# =============================================================================
# PHASE B: Deutsche Gesetze → bp_compliance_gesetze
# =============================================================================
# PHASE B: upload German law texts (individual paragraphs plus full-law
# concatenations from the bundestag/gesetze repo) into the
# bp_compliance_gesetze collection. Logs chunk counts before and after.
phase_gesetze() {
    log "=========================================="
    log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
    log "=========================================="
    local col="bp_compliance_gesetze"
    local before
    before=$(collection_count "$col")
    log "Collection $col: $before chunks (before)"
    # B1: individual paragraphs extracted in phase_download (A2)
    upload_file "$WORK_DIR/texts/ddg_5.txt" "$col" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}' \
        "DDG § 5 (Impressumspflicht)"
    upload_file "$WORK_DIR/texts/tdddg_25.txt" "$col" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}' \
        "TDDDG § 25 (Cookies/Endgeraetezugriff)"
    upload_file "$WORK_DIR/texts/urhg_5.txt" "$col" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}' \
        "UrhG § 5 (Amtliche Werke)"
    upload_file "$WORK_DIR/texts/egbgb_widerruf.txt" "$col" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}' \
        "EGBGB Muster-Widerrufsbelehrung"
    # B2: full-law files produced by concat_bundestag_gesetz in phase A5.
    # Each entry is "repo-key:Label:Full name".
    local -a bundestag_upload=(
        "bgb:BGB:Buergerliches Gesetzbuch"
        "urhg:UrhG:Urheberrechtsgesetz"
        "tmg:TMG:Telemediengesetz"
    )
    for entry in "${bundestag_upload[@]}"; do
        local gesetz="${entry%%:*}"
        local rest="${entry#*:}"
        local label="${rest%%:*}"
        local fullname="${rest#*:}"
        local file="$WORK_DIR/texts/bundestag_${gesetz}_komplett.txt"
        if [[ -f "$file" ]]; then
            # metadata_json is built with shell expansion, hence double quotes.
            upload_file "$file" "$col" "compliance" "legal_reference" "2024" \
                "{\"regulation_id\":\"${gesetz}_komplett\",\"regulation_name_de\":\"$fullname ($label komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
                "$label komplett (Bundestag-Repo)"
        fi
    done
    local after
    after=$(collection_count "$col")
    log "Collection $col: $before$after chunks"
}
# =============================================================================
# PHASE C: EU-Rechtstexte → bp_compliance_ce
# =============================================================================
# PHASE C: upload the three EUR-Lex PDFs (DSA, ePrivacy Directive, SCC)
# into the bp_compliance_ce collection. Logs chunk counts before/after.
phase_eu() {
    log "=========================================="
    log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
    log "=========================================="
    local col="bp_compliance_ce"
    local before
    before=$(collection_count "$col")
    log "Collection $col: $before chunks (before)"
    upload_file "$WORK_DIR/pdfs/dsa_2022_2065.pdf" "$col" "compliance_ce" "legal_reference" "2022" \
        '{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}' \
        "Digital Services Act (EU) 2022/2065"
    upload_file "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" "$col" "compliance_ce" "legal_reference" "2002" \
        '{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}' \
        "ePrivacy-Richtlinie 2002/58/EC"
    upload_file "$WORK_DIR/pdfs/scc_2021_914.pdf" "$col" "compliance_ce" "legal_reference" "2021" \
        '{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}' \
        "Standardvertragsklauseln (EU) 2021/914"
    local after
    after=$(collection_count "$col")
    log "Collection $col: $before$after chunks"
}
# =============================================================================
# PHASE D: Templates/Textbausteine → bp_legal_templates
# =============================================================================
phase_templates() {
log "=========================================="
log "PHASE D: Templates → bp_legal_templates"
log "=========================================="
local col="bp_legal_templates"
local before
before=$(collection_count "$col")
log "Collection $col: $before chunks (before)"
# --- D1: GitHub Site Policy (CC0) ---
local repo="$WORK_DIR/repos/github-site-policy"
if [[ -d "$repo" ]]; then
log "--- GitHub Site Policy ---"
find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort | while read -r f; do
local basename
basename=$(basename "$f" .md)
local doc_type="policy"
case "$basename" in
*terms*|*tos*|*service*) doc_type="tos" ;;
*privacy*|*data*) doc_type="privacy_policy" ;;
*dmca*|*copyright*) doc_type="dmca" ;;
*acceptable*|*use*) doc_type="acceptable_use" ;;
esac
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$basename\"}" \
"GitHub: $basename"
done
fi
# --- D2: OpenGov Site Policy (CC0) ---
repo="$WORK_DIR/repos/opengov-site-policy"
if [[ -d "$repo" ]]; then
log "--- OpenGov Site Policy ---"
find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort | while read -r f; do
local basename
basename=$(basename "$f" .md)
local doc_type="policy"
case "$basename" in
*terms*|*tos*) doc_type="tos" ;;
*privacy*) doc_type="privacy_policy" ;;
esac
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$basename\"}" \
"OpenGov: $basename"
done
fi
# --- D3: Creative Commons Legal Tools (CC0) ---
repo="$WORK_DIR/repos/cc-legal-tools"
if [[ -d "$repo" ]]; then
log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
# Only ingest the main license deeds (DE legalcode where available, else EN)
for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
if [[ -d "$license_dir" ]]; then
find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
"CC License: $basename"
done
fi
done
# Fallback: try top-level legalcode files
find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
"CC License: $basename"
done
fi
# --- D4: opr.vc DSGVO-Mustertexte (CC0) ---
repo="$WORK_DIR/repos/oprvc"
if [[ -d "$repo" ]]; then
log "--- opr.vc DSGVO-Mustertexte ---"
# Look for German privacy/DSGVO content
find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
-not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
| grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
| head -20 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
"opr.vc: $basename"
done
# If no specific files found, try all markdown files
if [[ $(find "$repo" \( -name "*.md" -o -name "*.html" \) -not -path "*/.git/*" -not -name "README.md" | grep -ciE "(datenschutz|privacy|dsgvo|gdpr)" 2>/dev/null) -eq 0 ]]; then
find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
"opr.vc: $basename"
done
fi
fi
# --- D5: webflorist/privacy-policy-text (MIT) ---
repo="$WORK_DIR/repos/webflorist"
if [[ -d "$repo" ]]; then
log "--- webflorist Privacy Policy Text ---"
# Look for JSON/text building blocks (German)
find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
-not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
-not -name "composer.json" -not -name "README.md" 2>/dev/null \
| head -20 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"webflorist\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/webflorist/privacy-policy-text\",\"filename\":\"$basename\"}" \
"webflorist: $basename"
done
fi
# --- D6: Tempest Privacy Policy Generator (MIT) ---
repo="$WORK_DIR/repos/tempest-privacy"
if [[ -d "$repo" ]]; then
log "--- Tempest Privacy Policy Generator ---"
find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
-not -path "*/.git/*" -not -path "*/node_modules/*" \
-not -name "package*.json" -not -name "README.md" 2>/dev/null \
| head -15 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"tempest_privacy\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/privacy-policy-generator\",\"filename\":\"$basename\"}" \
"Tempest Privacy: $basename"
done
fi
# --- D7: Tempest Terms of Service Generator (MIT) ---
repo="$WORK_DIR/repos/tempest-tos"
if [[ -d "$repo" ]]; then
log "--- Tempest Terms of Service Generator ---"
find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
-not -path "*/.git/*" -not -path "*/node_modules/*" \
-not -name "package*.json" -not -name "README.md" 2>/dev/null \
| head -15 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"tempest_tos\",\"doc_type\":\"tos\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/terms-of-service-generator\",\"filename\":\"$basename\"}" \
"Tempest ToS: $basename"
done
fi
# --- D8: Tempest Cookie Banner (MIT) ---
repo="$WORK_DIR/repos/tempest-cookie"
if [[ -d "$repo" ]]; then
log "--- Tempest Cookie Banner ---"
find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
-not -path "*/.git/*" -not -path "*/node_modules/*" \
-not -name "package*.json" -not -name "README.md" 2>/dev/null \
| head -15 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"tempest_cookie\",\"doc_type\":\"cookie_banner\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/cookie-banner-consent-solution\",\"filename\":\"$basename\"}" \
"Tempest Cookie: $basename"
done
fi
# --- D9: CookieConsent (orestbida) - UI Strings (MIT) ---
repo="$WORK_DIR/repos/cookieconsent"
if [[ -d "$repo" ]]; then
log "--- CookieConsent UI Strings ---"
# Look for translation/language files
find "$repo" -path "*/translations/*" -o -path "*/languages/*" -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
| grep -iE "\.(json|js|ts)$" | head -10 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
"CookieConsent: $basename"
done
# Also check for example configs
find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5 | while read -r f; do
local basename
basename=$(basename "$f")
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
"CookieConsent Docs: $basename"
done
fi
# --- D10: Common Paper (CC BY 4.0) ---
log "--- Common Paper Standards ---"
local -a cp_repos=(
"common-paper-csa:saas_contract:CSA"
"common-paper-sla:sla:SLA"
"common-paper-psa:psa:PSA"
)
for entry in "${cp_repos[@]}"; do
local cp_dir="${entry%%:*}"
local rest="${entry#*:}"
local cp_doc_type="${rest%%:*}"
local cp_label="${rest#*:}"
repo="$WORK_DIR/repos/$cp_dir"
if [[ -d "$repo" ]]; then
find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
-not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
| head -10 | while read -r f; do
local basename
basename=$(basename "$f" .md)
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\",\"filename\":\"$basename\"}" \
"CommonPaper $cp_label: $basename"
done
fi
done
# --- D11: Datennutzungsklauseln (CC BY 4.0) ---
repo="$WORK_DIR/repos/datennutzungsklauseln"
if [[ -d "$repo" ]]; then
log "--- Datennutzungsklauseln ---"
find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
| head -15 | while read -r f; do
local basename
basename=$(basename "$f" .md)
upload_file "$f" "$col" "legal_template" "template" "2024" \
"{\"source_id\":\"datennutzungsklauseln\",\"doc_type\":\"data_clause\",\"license\":\"cc_by_4\",\"attribution\":\"OpenCode.de, lizenziert unter CC BY 4.0\",\"source\":\"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster\",\"filename\":\"$basename\"}" \
"Datennutzungsklausel: $basename"
done
fi
local after
after=$(collection_count "$col")
log "Collection $col: $before$after chunks"
}
# =============================================================================
# PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz
# =============================================================================
phase_datenschutz() {
    # Phase E: ingest EDPB/EDPS data-protection guidance PDFs into the
    # bp_compliance_datenschutz collection. PDFs are expected under
    # $WORK_DIR/pdfs/ named edpb_<name>.pdf / edps_<name>.pdf; <name> with
    # underscores turned into spaces becomes the human-readable label.
    log "=========================================="
    log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
    log "=========================================="
    local collection="bp_compliance_datenschutz"
    local chunks_before
    chunks_before=$(collection_count "$collection")
    log "Collection $collection: $chunks_before chunks (before)"
    local pdf stem label
    # EDPB Guidelines
    for pdf in "$WORK_DIR"/pdfs/edpb_*.pdf; do
        [[ -f "$pdf" ]] || continue   # glob may not match anything
        stem=${pdf##*/}
        stem=${stem%.pdf}
        label=${stem#edpb_}
        label=${label//_/ }
        upload_file "$pdf" "$collection" "compliance_datenschutz" "guidance" "2024" \
            "{\"source_id\":\"edpb\",\"doc_type\":\"guidance\",\"guideline_name\":\"$label\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Board (EDPB)\",\"source\":\"edpb.europa.eu\"}" \
            "EDPB: $label"
    done
    # EDPS Guidance (same naming convention, edps_ prefix)
    for pdf in "$WORK_DIR"/pdfs/edps_*.pdf; do
        [[ -f "$pdf" ]] || continue
        stem=${pdf##*/}
        stem=${stem%.pdf}
        label=${stem#edps_}
        label=${label//_/ }
        upload_file "$pdf" "$collection" "compliance_datenschutz" "guidance" "2024" \
            "{\"source_id\":\"edps\",\"doc_type\":\"guidance\",\"guidance_name\":\"$label\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Supervisor (EDPS)\",\"source\":\"edps.europa.eu\"}" \
            "EDPS: $label"
    done
    local chunks_after
    chunks_after=$(collection_count "$collection")
    log "Collection $collection: $chunks_before$chunks_after chunks"
}
# =============================================================================
# PHASE F: Verifizierung
# =============================================================================
# Run one smoke-test search against the local RAG API and pretty-print the
# top hits. Never aborts the pipeline: any curl/parse failure degrades to
# an informational "(search failed)" / "(parse error)" line.
#   $1: human-readable description used in the log line
#   $2: query string sent to the search endpoint
_verify_search() {
    local description="$1"
    local query="$2"
    log "Suche: $description"
    curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
        -H 'Content-Type: application/json' \
        -d "{\"query\":\"$query\",\"regulation_codes\":null,\"limit\":3,\"min_score\":0.5}" 2>/dev/null \
        | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except: print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# Phase F: sanity-check the ingested corpus. Prints per-collection chunk
# counts and runs three representative test searches (one per major
# collection). Purely informational — does not change exit status.
phase_verify() {
    log "=========================================="
    log "PHASE F: Verifizierung"
    log "=========================================="
    echo ""
    echo "=== Collection Stats ==="
    local col count
    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        count=$(collection_count "$col")
        printf " %-30s %s chunks\n" "$col" "$count"
    done
    echo ""
    echo "=== Test-Suchen ==="
    # Previously three verbatim copies of the curl+python block; now a
    # single helper parameterized by log label and query.
    _verify_search "'Impressumspflicht digitale Dienste' in bp_compliance_gesetze" \
        "Impressumspflicht digitale Dienste"
    _verify_search "'Cookie Einwilligung' in bp_compliance_ce" \
        "Cookie Einwilligung ePrivacy"
    _verify_search "'Privacy Policy Template GDPR' in bp_legal_templates" \
        "Privacy Policy Template GDPR"
    echo ""
}
# =============================================================================
# PHASE G: Corpus Version Registration
# =============================================================================
phase_register_version() {
    # Phase G: record one row per non-empty collection in
    # compliance_corpus_versions. Version format is YYYY-MM-DD.<seq>,
    # where <seq> is 1 + the number of versions already registered today
    # for that collection.
    log "=========================================="
    log "PHASE G: Corpus Version Registration"
    log "=========================================="
    local stamp
    stamp=$(date '+%Y-%m-%d')
    local col chunks prior ver reg_list digest
    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        chunks=$(collection_count "$col")
        if [[ "$chunks" == "?" || "$chunks" == "0" ]]; then
            warn "Skipping version for $col (count=$chunks)"
            continue
        fi
        # Next sequence number for today's date (falls back to 0 if the
        # DB is unreachable, yielding version <date>.1).
        prior=$(psql "$DB_URL" -tAc \
            "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${stamp}.%'" \
            2>/dev/null || echo "0")
        ver="${stamp}.$((prior + 1))"
        # Regulations covered, keyed by collection (Postgres array literal).
        reg_list=""
        case "$col" in
            bp_compliance_ce)
                reg_list='{eu_2022_2065,eu_2002_58,eu_2021_914}'
                ;;
            bp_compliance_gesetze)
                reg_list='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
                ;;
            bp_legal_templates)
                reg_list='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
                ;;
            bp_compliance_datenschutz)
                reg_list='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
                ;;
        esac
        # Best-effort fingerprint of the Qdrant collection info; empty on failure.
        digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
            | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
            2>/dev/null || echo "")
        log "Registering version $ver for $col ($chunks chunks)"
        # NOTE(review): documents_count is the run-wide $UPLOADED total, not a
        # per-collection figure — presumably intentional, verify against schema.
        psql "$DB_URL" -c "
        INSERT INTO compliance_corpus_versions
            (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
        VALUES
            ('${ver}', '${col}', ${UPLOADED}, ${chunks}, '${reg_list}', '${digest}', 'ingest-legal-corpus.sh', 'system')
        " 2>/dev/null && ok "Version $ver registered for $col" || warn "Version registration failed for $col (DB not available?)"
    done
}
# =============================================================================
# MAIN
# =============================================================================
# Entry point: verify the RAG API and Qdrant are reachable, run either a
# single phase (when $ONLY_PHASE is set) or the full pipeline, then print
# the upload summary. Exits 1 on unreachable services, an unknown phase
# name, or any failed uploads.
main() {
    log "=========================================="
    log "BreakPilot Legal Corpus Ingestion"
    log "=========================================="
    log "Work dir: $WORK_DIR"
    log "RAG API: $RAG_URL"
    log "Qdrant: $QDRANT_URL"
    echo ""
    # Preflight: a bare POST to the upload endpoint answers with an error
    # mentioning the missing "file" field — good enough as a liveness probe.
    if curl $CURL_OPTS "$RAG_URL" -X POST 2>/dev/null | grep -q "file"; then
        ok "RAG API reachable"
    else
        fail "RAG API not reachable at $RAG_URL"
        exit 1
    fi
    if curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
        ok "Qdrant reachable"
    else
        fail "Qdrant not reachable at $QDRANT_URL"
        exit 1
    fi
    echo ""
    if [[ -n "$ONLY_PHASE" ]]; then
        # Single-phase mode (--phase NAME).
        case "$ONLY_PHASE" in
            download)    phase_download ;;
            gesetze)     phase_gesetze ;;
            eu)          phase_eu ;;
            templates)   phase_templates ;;
            datenschutz) phase_datenschutz ;;
            verify)      phase_verify ;;
            version)     phase_register_version ;;
            *)           fail "Unknown phase: $ONLY_PHASE"; exit 1 ;;
        esac
    else
        # Full pipeline: optional download, then every phase in order.
        if [[ "$SKIP_DOWNLOAD" == "true" ]]; then
            log "Skipping download phase (--skip-download)"
        else
            phase_download
        fi
        local step
        for step in phase_gesetze phase_eu phase_templates phase_datenschutz phase_verify phase_register_version; do
            echo ""
            "$step"
        done
    fi
    # Summary
    echo ""
    log "=========================================="
    log "ERGEBNIS"
    log "=========================================="
    log "Uploaded: $UPLOADED"
    log "Failed: $FAILED"
    log "Skipped: $SKIPPED"
    log "=========================================="
    if [[ $FAILED -gt 0 ]]; then
        warn "$FAILED uploads fehlgeschlagen!"
        exit 1
    fi
    ok "Ingestion abgeschlossen!"
}

main "$@"