feat: add RAG corpus versioning and source policy backend
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 34s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 34s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
Part 1 — RAG Corpus Versioning: - New DB table compliance_corpus_versions (migration 017) - Go CorpusVersionStore with CRUD operations - Assessment struct extended with corpus_version_id - API endpoints: GET /rag/corpus-status, /rag/corpus-versions/:collection - RAG routes (search, regulations) now registered in main.go - Ingestion script registers corpus versions after each run - Frontend staleness badge in SDK sidebar Part 3 — Source Policy Backend: - New FastAPI router with CRUD for allowed sources, PII rules, operations matrix, audit trail, stats, and compliance report - SQLAlchemy models for all source policy tables (migration 001) - Frontend API base corrected from edu-search:8088/8089 to backend-compliance:8002/api Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,8 +3,8 @@
|
||||
/**
 * Source Policy Management Page (SDK Version)
 *
 * Whitelist-based data source management for the compliance RAG corpus.
 * Controls which legal sources may be used, PII rules, and audit trail.
 */
|
||||
|
||||
import { useState, useEffect } from 'react'
|
||||
@@ -15,14 +15,14 @@ import { OperationsMatrixTab } from '@/components/sdk/source-policy/OperationsMa
|
||||
import { PIIRulesTab } from '@/components/sdk/source-policy/PIIRulesTab'
|
||||
import { AuditTab } from '@/components/sdk/source-policy/AuditTab'
|
||||
|
||||
// API base URL for edu-search-service
|
||||
// API base URL for backend-compliance
|
||||
const getApiBase = () => {
|
||||
if (typeof window === 'undefined') return 'http://localhost:8088'
|
||||
if (typeof window === 'undefined') return 'http://localhost:8002/api'
|
||||
const hostname = window.location.hostname
|
||||
if (hostname === 'localhost' || hostname === '127.0.0.1') {
|
||||
return 'http://localhost:8088'
|
||||
return 'http://localhost:8002/api'
|
||||
}
|
||||
return `https://${hostname}:8089`
|
||||
return `https://${hostname}:8002/api`
|
||||
}
|
||||
|
||||
interface PolicyStats {
|
||||
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
getStepsForPackage,
|
||||
type SDKPackageId,
|
||||
type SDKStep,
|
||||
type RAGCorpusStatus,
|
||||
} from '@/lib/sdk'
|
||||
|
||||
// =============================================================================
|
||||
@@ -288,6 +289,41 @@ interface SDKSidebarProps {
|
||||
onCollapsedChange?: (collapsed: boolean) => void
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// CORPUS STALENESS INFO
|
||||
// =============================================================================
|
||||
|
||||
function CorpusStalenessInfo({ ragCorpusStatus }: { ragCorpusStatus: RAGCorpusStatus }) {
|
||||
const collections = ragCorpusStatus.collections
|
||||
const collectionNames = Object.keys(collections)
|
||||
if (collectionNames.length === 0) return null
|
||||
|
||||
// Check if corpus was updated after the last fetch (simplified: show last update time)
|
||||
const lastUpdated = collectionNames.reduce((latest, name) => {
|
||||
const updated = new Date(collections[name].last_updated)
|
||||
return updated > latest ? updated : latest
|
||||
}, new Date(0))
|
||||
|
||||
const daysSinceUpdate = Math.floor((Date.now() - lastUpdated.getTime()) / (1000 * 60 * 60 * 24))
|
||||
const totalChunks = collectionNames.reduce((sum, name) => sum + collections[name].chunks_count, 0)
|
||||
|
||||
return (
|
||||
<div className="px-4 py-2 border-b border-gray-100">
|
||||
<div className="flex items-center gap-2 text-xs">
|
||||
<div className={`w-2 h-2 rounded-full flex-shrink-0 ${daysSinceUpdate > 30 ? 'bg-amber-400' : 'bg-green-400'}`} />
|
||||
<span className="text-gray-500 truncate">
|
||||
RAG Corpus: {totalChunks} Chunks
|
||||
</span>
|
||||
</div>
|
||||
{daysSinceUpdate > 30 && (
|
||||
<div className="mt-1 text-xs text-amber-600 bg-amber-50 rounded px-2 py-1">
|
||||
Corpus {daysSinceUpdate}d alt — Re-Evaluation empfohlen
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarProps) {
|
||||
const pathname = usePathname()
|
||||
const { state, packageCompletion, completionPercentage, getCheckpointStatus } = useSDK()
|
||||
@@ -391,6 +427,11 @@ export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarP
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* RAG Corpus Staleness Badge */}
|
||||
{!collapsed && state.ragCorpusStatus && (
|
||||
<CorpusStalenessInfo ragCorpusStatus={state.ragCorpusStatus} />
|
||||
)}
|
||||
|
||||
{/* Navigation - 5 Packages */}
|
||||
<nav className="flex-1 overflow-y-auto">
|
||||
{SDK_PACKAGES.map(pkg => {
|
||||
@@ -510,6 +551,18 @@ export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarP
|
||||
isActive={pathname === '/sdk/dsms'}
|
||||
collapsed={collapsed}
|
||||
/>
|
||||
<AdditionalModuleItem
|
||||
href="/development/sdk-flow"
|
||||
icon={
|
||||
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2}
|
||||
d="M13 10V3L4 14h7v7l9-11h-7z" />
|
||||
</svg>
|
||||
}
|
||||
label="SDK Flow"
|
||||
isActive={pathname === '/development/sdk-flow'}
|
||||
collapsed={collapsed}
|
||||
/>
|
||||
{state.companyProfile?.machineBuilder?.ceMarkingRequired && (
|
||||
<AdditionalModuleItem
|
||||
href="/sdk/iace"
|
||||
|
||||
@@ -102,6 +102,9 @@ const initialState: SDKState = {
|
||||
// IACE (Industrial AI Compliance Engine)
|
||||
iaceProjects: [],
|
||||
|
||||
// RAG Corpus Versioning
|
||||
ragCorpusStatus: null,
|
||||
|
||||
// Security
|
||||
sbom: null,
|
||||
securityIssues: [],
|
||||
|
||||
@@ -975,6 +975,22 @@ export interface SBOMDependency {
|
||||
to: string
|
||||
}
|
||||
|
||||
// RAG Corpus Versioning

// Version status of a single RAG collection as returned (per collection)
// by the backend's GET /rag/corpus-status endpoint.
export interface RAGCorpusCollectionStatus {
  id: string               // corpus version record ID
  current_version: string  // version label, e.g. "2026-03-02.1"
  documents_count: number
  chunks_count: number
  regulations: string[]    // regulation identifiers, e.g. "eu_2016_679"
  last_updated: string     // timestamp of the latest version record
  digest: string           // SHA256 digest over the corpus chunks
}
|
||||
|
||||
// Aggregated corpus status for all collections, keyed by collection name,
// plus the client-side timestamp of when the status was fetched.
export interface RAGCorpusStatus {
  collections: Record<string, RAGCorpusCollectionStatus>
  fetchedAt: string
}
|
||||
|
||||
export interface SBOM {
|
||||
format: 'CycloneDX' | 'SPDX'
|
||||
version: string
|
||||
@@ -1504,6 +1520,9 @@ export interface SDKState {
|
||||
// IACE (Industrial AI Compliance Engine)
|
||||
iaceProjects: IACEProjectSummary[]
|
||||
|
||||
// RAG Corpus Versioning
|
||||
ragCorpusStatus: RAGCorpusStatus | null
|
||||
|
||||
// Security
|
||||
sbom: SBOM | null
|
||||
securityIssues: SecurityIssue[]
|
||||
|
||||
@@ -62,6 +62,7 @@ func main() {
|
||||
dsgvoStore := dsgvo.NewStore(pool)
|
||||
uccaStore := ucca.NewStore(pool)
|
||||
escalationStore := ucca.NewEscalationStore(pool)
|
||||
corpusVersionStore := ucca.NewCorpusVersionStore(pool)
|
||||
roadmapStore := roadmap.NewStore(pool)
|
||||
workshopStore := workshop.NewStore(pool)
|
||||
portfolioStore := portfolio.NewStore(pool)
|
||||
@@ -120,6 +121,7 @@ func main() {
|
||||
vendorHandlers := handlers.NewVendorHandlers(vendorStore)
|
||||
iaceHandler := handlers.NewIACEHandler(iaceStore)
|
||||
trainingHandlers := handlers.NewTrainingHandlers(trainingStore, contentGenerator)
|
||||
ragHandlers := handlers.NewRAGHandlers(corpusVersionStore)
|
||||
|
||||
// Initialize middleware
|
||||
rbacMiddleware := rbac.NewMiddleware(rbacService, policyEngine)
|
||||
@@ -345,6 +347,15 @@ func main() {
|
||||
uccaRoutes.POST("/dsb-pool", escalationHandlers.AddDSBPoolMember)
|
||||
}
|
||||
|
||||
// RAG routes - Legal Corpus Search & Versioning
|
||||
ragRoutes := v1.Group("/rag")
|
||||
{
|
||||
ragRoutes.POST("/search", ragHandlers.Search)
|
||||
ragRoutes.GET("/regulations", ragHandlers.ListRegulations)
|
||||
ragRoutes.GET("/corpus-status", ragHandlers.CorpusStatus)
|
||||
ragRoutes.GET("/corpus-versions/:collection", ragHandlers.CorpusVersionHistory)
|
||||
}
|
||||
|
||||
// Roadmap routes - Compliance Implementation Roadmaps
|
||||
roadmapRoutes := v1.Group("/roadmaps")
|
||||
{
|
||||
|
||||
@@ -9,13 +9,15 @@ import (
|
||||
|
||||
// RAGHandlers handles RAG search API endpoints.
|
||||
type RAGHandlers struct {
|
||||
ragClient *ucca.LegalRAGClient
|
||||
ragClient *ucca.LegalRAGClient
|
||||
corpusVersionStore *ucca.CorpusVersionStore
|
||||
}
|
||||
|
||||
// NewRAGHandlers creates new RAG handlers.
|
||||
func NewRAGHandlers() *RAGHandlers {
|
||||
func NewRAGHandlers(corpusVersionStore *ucca.CorpusVersionStore) *RAGHandlers {
|
||||
return &RAGHandlers{
|
||||
ragClient: ucca.NewLegalRAGClient(),
|
||||
ragClient: ucca.NewLegalRAGClient(),
|
||||
corpusVersionStore: corpusVersionStore,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,3 +76,62 @@ func (h *RAGHandlers) ListRegulations(c *gin.Context) {
|
||||
"count": len(regs),
|
||||
})
|
||||
}
|
||||
|
||||
// CorpusStatus returns the current version status of all RAG collections.
|
||||
// GET /sdk/v1/rag/corpus-status
|
||||
func (h *RAGHandlers) CorpusStatus(c *gin.Context) {
|
||||
if h.corpusVersionStore == nil {
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "corpus version store not configured"})
|
||||
return
|
||||
}
|
||||
|
||||
versions, err := h.corpusVersionStore.GetAllLatestVersions(c.Request.Context())
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch corpus versions: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
collections := make(map[string]gin.H)
|
||||
for _, v := range versions {
|
||||
collections[v.CollectionName] = gin.H{
|
||||
"id": v.ID,
|
||||
"current_version": v.Version,
|
||||
"documents_count": v.DocumentsCount,
|
||||
"chunks_count": v.ChunksCount,
|
||||
"regulations": v.Regulations,
|
||||
"last_updated": v.CreatedAt,
|
||||
"digest": v.Digest,
|
||||
}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"collections": collections,
|
||||
})
|
||||
}
|
||||
|
||||
// CorpusVersionHistory returns the version history for a specific collection.
|
||||
// GET /sdk/v1/rag/corpus-versions/:collection
|
||||
func (h *RAGHandlers) CorpusVersionHistory(c *gin.Context) {
|
||||
if h.corpusVersionStore == nil {
|
||||
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "corpus version store not configured"})
|
||||
return
|
||||
}
|
||||
|
||||
collection := c.Param("collection")
|
||||
if collection == "" {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "collection name required"})
|
||||
return
|
||||
}
|
||||
|
||||
versions, err := h.corpusVersionStore.ListCorpusVersions(c.Request.Context(), collection)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch corpus versions: " + err.Error()})
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"collection": collection,
|
||||
"versions": versions,
|
||||
"count": len(versions),
|
||||
})
|
||||
}
|
||||
|
||||
158
ai-compliance-sdk/internal/ucca/corpus_version.go
Normal file
158
ai-compliance-sdk/internal/ucca/corpus_version.go
Normal file
@@ -0,0 +1,158 @@
|
||||
package ucca
|
||||
|
||||
import (
	"context"
	"errors"
	"time"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
)
|
||||
|
||||
// CorpusVersion tracks a specific version of the RAG compliance corpus.
// The ingestion script registers one record per run; assessments reference
// a record via corpus_version_id so staleness can be detected later.
type CorpusVersion struct {
	ID              uuid.UUID `json:"id"`
	Version         string    `json:"version"`         // e.g. "2026-03-02.1"
	CollectionName  string    `json:"collection_name"` // e.g. "bp_compliance_ce"
	DocumentsCount  int       `json:"documents_count"`
	ChunksCount     int       `json:"chunks_count"`
	Regulations     []string  `json:"regulations"`                // regulation IDs, e.g. ["eu_2016_679", ...]
	Digest          string    `json:"digest,omitempty"`           // SHA256 over chunks
	IngestionSource string    `json:"ingestion_source,omitempty"` // e.g. "ingest-legal-corpus.sh"
	Notes           string    `json:"notes,omitempty"`
	CreatedAt       time.Time `json:"created_at"`
	CreatedBy       string    `json:"created_by,omitempty"`
}
|
||||
|
||||
// CorpusVersionStore handles corpus version persistence, backed by a pgx
// connection pool and the compliance_corpus_versions table.
type CorpusVersionStore struct {
	pool *pgxpool.Pool
}

// NewCorpusVersionStore creates a new corpus version store.
func NewCorpusVersionStore(pool *pgxpool.Pool) *CorpusVersionStore {
	return &CorpusVersionStore{pool: pool}
}
|
||||
|
||||
// CreateCorpusVersion inserts a new corpus version record.
|
||||
func (s *CorpusVersionStore) CreateCorpusVersion(ctx context.Context, v *CorpusVersion) error {
|
||||
if v.ID == uuid.Nil {
|
||||
v.ID = uuid.New()
|
||||
}
|
||||
if v.CreatedAt.IsZero() {
|
||||
v.CreatedAt = time.Now().UTC()
|
||||
}
|
||||
|
||||
_, err := s.pool.Exec(ctx, `
|
||||
INSERT INTO compliance_corpus_versions (
|
||||
id, version, collection_name, documents_count, chunks_count,
|
||||
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
||||
`,
|
||||
v.ID, v.Version, v.CollectionName, v.DocumentsCount, v.ChunksCount,
|
||||
v.Regulations, v.Digest, v.IngestionSource, v.Notes, v.CreatedAt, v.CreatedBy,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
// GetLatestCorpusVersion returns the most recent version for a collection.
|
||||
func (s *CorpusVersionStore) GetLatestCorpusVersion(ctx context.Context, collection string) (*CorpusVersion, error) {
|
||||
var v CorpusVersion
|
||||
err := s.pool.QueryRow(ctx, `
|
||||
SELECT id, version, collection_name, documents_count, chunks_count,
|
||||
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||
FROM compliance_corpus_versions
|
||||
WHERE collection_name = $1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1
|
||||
`, collection).Scan(
|
||||
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||
)
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &v, nil
|
||||
}
|
||||
|
||||
// GetCorpusVersionByID retrieves a specific corpus version by ID.
|
||||
func (s *CorpusVersionStore) GetCorpusVersionByID(ctx context.Context, id uuid.UUID) (*CorpusVersion, error) {
|
||||
var v CorpusVersion
|
||||
err := s.pool.QueryRow(ctx, `
|
||||
SELECT id, version, collection_name, documents_count, chunks_count,
|
||||
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||
FROM compliance_corpus_versions
|
||||
WHERE id = $1
|
||||
`, id).Scan(
|
||||
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||
)
|
||||
if err == pgx.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &v, nil
|
||||
}
|
||||
|
||||
// ListCorpusVersions returns all versions for a collection, newest first.
|
||||
func (s *CorpusVersionStore) ListCorpusVersions(ctx context.Context, collection string) ([]CorpusVersion, error) {
|
||||
rows, err := s.pool.Query(ctx, `
|
||||
SELECT id, version, collection_name, documents_count, chunks_count,
|
||||
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||
FROM compliance_corpus_versions
|
||||
WHERE collection_name = $1
|
||||
ORDER BY created_at DESC
|
||||
`, collection)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var versions []CorpusVersion
|
||||
for rows.Next() {
|
||||
var v CorpusVersion
|
||||
err := rows.Scan(
|
||||
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
versions = append(versions, v)
|
||||
}
|
||||
return versions, nil
|
||||
}
|
||||
|
||||
// GetAllLatestVersions returns the latest version for every collection.
|
||||
func (s *CorpusVersionStore) GetAllLatestVersions(ctx context.Context) ([]CorpusVersion, error) {
|
||||
rows, err := s.pool.Query(ctx, `
|
||||
SELECT DISTINCT ON (collection_name)
|
||||
id, version, collection_name, documents_count, chunks_count,
|
||||
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||
FROM compliance_corpus_versions
|
||||
ORDER BY collection_name, created_at DESC
|
||||
`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var versions []CorpusVersion
|
||||
for rows.Next() {
|
||||
var v CorpusVersion
|
||||
err := rows.Scan(
|
||||
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
versions = append(versions, v)
|
||||
}
|
||||
return versions, nil
|
||||
}
|
||||
@@ -471,6 +471,10 @@ type Assessment struct {
|
||||
Art22Risk bool `json:"art22_risk"`
|
||||
TrainingAllowed TrainingAllowed `json:"training_allowed"`
|
||||
|
||||
// Corpus Versioning (RAG)
|
||||
CorpusVersionID *uuid.UUID `json:"corpus_version_id,omitempty"`
|
||||
CorpusVersion string `json:"corpus_version,omitempty"`
|
||||
|
||||
// LLM Explanation (optional)
|
||||
ExplanationText *string `json:"explanation_text,omitempty"`
|
||||
ExplanationGeneratedAt *time.Time `json:"explanation_generated_at,omitempty"`
|
||||
|
||||
@@ -52,6 +52,7 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
|
||||
triggered_rules, required_controls, recommended_architecture,
|
||||
forbidden_patterns, example_matches,
|
||||
dsfa_recommended, art22_risk, training_allowed,
|
||||
corpus_version_id, corpus_version,
|
||||
explanation_text, explanation_generated_at, explanation_model,
|
||||
domain, created_at, updated_at, created_by
|
||||
) VALUES (
|
||||
@@ -61,8 +62,9 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
|
||||
$14, $15, $16,
|
||||
$17, $18,
|
||||
$19, $20, $21,
|
||||
$22, $23, $24,
|
||||
$25, $26, $27, $28
|
||||
$22, $23,
|
||||
$24, $25, $26,
|
||||
$27, $28, $29, $30
|
||||
)
|
||||
`,
|
||||
a.ID, a.TenantID, a.NamespaceID, a.Title, a.PolicyVersion, a.Status,
|
||||
@@ -71,6 +73,7 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
|
||||
triggeredRules, requiredControls, recommendedArchitecture,
|
||||
forbiddenPatterns, exampleMatches,
|
||||
a.DSFARecommended, a.Art22Risk, string(a.TrainingAllowed),
|
||||
a.CorpusVersionID, a.CorpusVersion,
|
||||
a.ExplanationText, a.ExplanationGeneratedAt, a.ExplanationModel,
|
||||
string(a.Domain), a.CreatedAt, a.UpdatedAt, a.CreatedBy,
|
||||
)
|
||||
@@ -92,6 +95,7 @@ func (s *Store) GetAssessment(ctx context.Context, id uuid.UUID) (*Assessment, e
|
||||
triggered_rules, required_controls, recommended_architecture,
|
||||
forbidden_patterns, example_matches,
|
||||
dsfa_recommended, art22_risk, training_allowed,
|
||||
corpus_version_id, corpus_version,
|
||||
explanation_text, explanation_generated_at, explanation_model,
|
||||
domain, created_at, updated_at, created_by
|
||||
FROM ucca_assessments WHERE id = $1
|
||||
@@ -102,6 +106,7 @@ func (s *Store) GetAssessment(ctx context.Context, id uuid.UUID) (*Assessment, e
|
||||
&triggeredRules, &requiredControls, &recommendedArchitecture,
|
||||
&forbiddenPatterns, &exampleMatches,
|
||||
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
|
||||
&a.CorpusVersionID, &a.CorpusVersion,
|
||||
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
|
||||
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
|
||||
)
|
||||
@@ -141,6 +146,7 @@ func (s *Store) ListAssessments(ctx context.Context, tenantID uuid.UUID, filters
|
||||
triggered_rules, required_controls, recommended_architecture,
|
||||
forbidden_patterns, example_matches,
|
||||
dsfa_recommended, art22_risk, training_allowed,
|
||||
corpus_version_id, corpus_version,
|
||||
explanation_text, explanation_generated_at, explanation_model,
|
||||
domain, created_at, updated_at, created_by
|
||||
FROM ucca_assessments WHERE tenant_id = $1`
|
||||
@@ -194,6 +200,7 @@ func (s *Store) ListAssessments(ctx context.Context, tenantID uuid.UUID, filters
|
||||
&triggeredRules, &requiredControls, &recommendedArchitecture,
|
||||
&forbiddenPatterns, &exampleMatches,
|
||||
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
|
||||
&a.CorpusVersionID, &a.CorpusVersion,
|
||||
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
|
||||
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
|
||||
)
|
||||
|
||||
35
ai-compliance-sdk/migrations/017_corpus_versioning.sql
Normal file
35
ai-compliance-sdk/migrations/017_corpus_versioning.sql
Normal file
@@ -0,0 +1,35 @@
|
||||
-- =============================================================================
-- Migration 017: RAG Corpus Versioning
--
-- Tracks versions of the RAG corpus so assessments can record which
-- corpus version they were evaluated against. Enables staleness detection
-- and re-evaluation recommendations.
-- =============================================================================

CREATE TABLE IF NOT EXISTS compliance_corpus_versions (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    version VARCHAR(50) NOT NULL,           -- e.g. "2026-03-02.1"
    collection_name VARCHAR(100) NOT NULL,  -- e.g. "bp_compliance_ce"
    documents_count INT NOT NULL DEFAULT 0,
    chunks_count INT NOT NULL DEFAULT 0,
    regulations TEXT[],                     -- e.g. {"eu_2016_679", "eu_2024_1689"}
    digest VARCHAR(128),                    -- SHA256 over all chunks
    ingestion_source VARCHAR(200),          -- e.g. "ingest-legal-corpus.sh"
    notes TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by VARCHAR(100)
);

-- Lookups by collection.
CREATE INDEX IF NOT EXISTS idx_corpus_versions_collection
    ON compliance_corpus_versions(collection_name);

-- Serves "latest version per collection" queries (created_at DESC).
CREATE INDEX IF NOT EXISTS idx_corpus_versions_latest
    ON compliance_corpus_versions(collection_name, created_at DESC);

-- Link assessments to the corpus version they were evaluated against.
ALTER TABLE ucca_assessments
    ADD COLUMN IF NOT EXISTS corpus_version_id UUID REFERENCES compliance_corpus_versions(id),
    ADD COLUMN IF NOT EXISTS corpus_version VARCHAR(50);

CREATE INDEX IF NOT EXISTS idx_ucca_assessments_corpus_version
    ON ucca_assessments(corpus_version_id);
|
||||
503
backend-compliance/compliance/api/source_policy_router.py
Normal file
503
backend-compliance/compliance/api/source_policy_router.py
Normal file
@@ -0,0 +1,503 @@
|
||||
"""
|
||||
Source Policy Router — Manages allowed compliance data sources.
|
||||
|
||||
Controls which legal sources the RAG corpus may use,
|
||||
operations matrix, PII rules, and provides audit trail.
|
||||
|
||||
Endpoints:
|
||||
GET /api/v1/admin/sources — List all sources
|
||||
POST /api/v1/admin/sources — Add new source
|
||||
GET /api/v1/admin/sources/{id} — Get source by ID
|
||||
PUT /api/v1/admin/sources/{id} — Update source
|
||||
DELETE /api/v1/admin/sources/{id} — Remove source
|
||||
GET /api/v1/admin/operations-matrix — Operations matrix
|
||||
PUT /api/v1/admin/operations/{id} — Update operation
|
||||
GET /api/v1/admin/pii-rules — List PII rules
|
||||
POST /api/v1/admin/pii-rules — Create PII rule
|
||||
PUT /api/v1/admin/pii-rules/{id} — Update PII rule
|
||||
DELETE /api/v1/admin/pii-rules/{id} — Delete PII rule
|
||||
GET /api/v1/admin/policy-audit — Audit trail
|
||||
GET /api/v1/admin/policy-stats — Dashboard statistics
|
||||
GET /api/v1/admin/compliance-report — Compliance report
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends, Query
|
||||
from pydantic import BaseModel, Field
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from database import get_db
|
||||
from compliance.db.source_policy_models import (
|
||||
AllowedSourceDB,
|
||||
SourceOperationDB,
|
||||
PIIRuleDB,
|
||||
SourcePolicyAuditDB,
|
||||
)
|
||||
|
||||
|
||||
router = APIRouter(prefix="/v1/admin", tags=["source-policy"])
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Pydantic Schemas
|
||||
# =============================================================================
|
||||
|
||||
class SourceCreate(BaseModel):
    """Request body for POST /v1/admin/sources (register a new allowed source)."""
    domain: str  # must be unique; create_source returns 409 on duplicates
    name: str
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: float = Field(default=0.5, ge=0.0, le=1.0)  # must lie in [0.0, 1.0]
    source_type: str = "legal"
    active: bool = True
    metadata: Optional[dict] = None  # stored on the DB attribute "metadata_"
||||
|
||||
|
||||
class SourceUpdate(BaseModel):
    """Partial-update body for PUT /v1/admin/sources/{id}.

    All fields are optional; only fields explicitly set by the client are
    applied (handlers use model_dump(exclude_unset=True)).
    """
    domain: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    source_type: Optional[str] = None
    active: Optional[bool] = None
    metadata: Optional[dict] = None  # mapped to the DB attribute "metadata_"
||||
|
||||
|
||||
class SourceResponse(BaseModel):
    """Full serialized representation of an allowed source.

    NOTE(review): the visible handlers build plain dicts rather than
    returning this model — confirm whether it is still used elsewhere.
    """
    id: str
    domain: str
    name: str
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: float
    source_type: str
    active: bool
    metadata: Optional[dict] = None
    created_at: str
    updated_at: Optional[str] = None

    class Config:
        # Allow construction from ORM objects (Pydantic v2 name for orm_mode).
        from_attributes = True
||||
|
||||
|
||||
class OperationUpdate(BaseModel):
    """Request body for PUT /v1/admin/operations/{id} (operations matrix)."""
    allowed: bool
    conditions: Optional[str] = None
||||
|
||||
|
||||
class PIIRuleCreate(BaseModel):
    """Request body for POST /v1/admin/pii-rules."""
    name: str
    description: Optional[str] = None
    pattern: Optional[str] = None  # presumably a detection regex — TODO confirm against matcher
    category: str
    action: str = "mask"
    active: bool = True
||||
|
||||
|
||||
class PIIRuleUpdate(BaseModel):
    """Partial-update body for PUT /v1/admin/pii-rules/{id}; unset fields are left unchanged."""
    name: Optional[str] = None
    description: Optional[str] = None
    pattern: Optional[str] = None
    category: Optional[str] = None
    action: Optional[str] = None
    active: Optional[bool] = None
||||
|
||||
|
||||
# =============================================================================
|
||||
# Helper: Audit logging
|
||||
# =============================================================================
|
||||
|
||||
def _log_audit(db: Session, action: str, entity_type: str, entity_id, old_values=None, new_values=None):
    """Stage an audit-trail row for a policy change.

    Only adds the row to the session; the caller is responsible for the
    commit, so the audit entry shares the transaction with the change itself.
    """
    db.add(
        SourcePolicyAuditDB(
            action=action,
            entity_type=entity_type,
            entity_id=entity_id,
            old_values=old_values,
            new_values=new_values,
            user_id="system",  # NOTE(review): no auth context threaded through yet
        )
    )
|
||||
|
||||
|
||||
def _source_to_dict(source: AllowedSourceDB) -> dict:
    """Snapshot the auditable fields of a source for the audit trail.

    Deliberately limited to scalar policy fields: metadata and timestamps
    are not part of the snapshot.
    """
    fields = (
        "domain", "name", "description", "license",
        "legal_basis", "trust_boost", "source_type", "active",
    )
    snapshot = {"id": str(source.id)}
    for field in fields:
        snapshot[field] = getattr(source, field)
    return snapshot
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Sources CRUD
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/sources")
async def list_sources(
    active_only: bool = Query(False),
    db: Session = Depends(get_db),
):
    """List all allowed sources, optionally restricted to active ones.

    Returns a dict with "sources" (full serializations, ordered by name)
    and "count".
    """
    query = db.query(AllowedSourceDB)
    if active_only:
        # .is_(True) is the SQLAlchemy-idiomatic boolean filter; it renders
        # as "IS true" and avoids the `== True` comparison flagged by linters.
        query = query.filter(AllowedSourceDB.active.is_(True))
    sources = query.order_by(AllowedSourceDB.name).all()
    return {
        "sources": [
            {
                "id": str(s.id),
                "domain": s.domain,
                "name": s.name,
                "description": s.description,
                "license": s.license,
                "legal_basis": s.legal_basis,
                "trust_boost": s.trust_boost,
                "source_type": s.source_type,
                "active": s.active,
                "metadata": s.metadata_,
                "created_at": s.created_at.isoformat() if s.created_at else None,
                "updated_at": s.updated_at.isoformat() if s.updated_at else None,
            }
            for s in sources
        ],
        "count": len(sources),
    }
|
||||
|
||||
|
||||
@router.post("/sources")
async def create_source(
    data: SourceCreate,
    db: Session = Depends(get_db),
):
    """Add a new allowed source.

    Raises 409 when a source with the same domain already exists.
    Writes an audit-trail entry in the same transaction as the insert.
    """
    existing = db.query(AllowedSourceDB).filter(AllowedSourceDB.domain == data.domain).first()
    if existing:
        raise HTTPException(status_code=409, detail=f"Source with domain '{data.domain}' already exists")

    source = AllowedSourceDB(
        domain=data.domain,
        name=data.name,
        description=data.description,
        license=data.license,
        legal_basis=data.legal_basis,
        trust_boost=data.trust_boost,
        source_type=data.source_type,
        active=data.active,
        metadata_=data.metadata,
    )
    db.add(source)
    # Flush so the database assigns source.id before the audit row references
    # it; previously the audit entry could be written with an unset entity_id.
    db.flush()
    _log_audit(db, "create", "source", source.id, new_values=_source_to_dict(source))
    db.commit()
    db.refresh(source)

    return {
        "id": str(source.id),
        "domain": source.domain,
        "name": source.name,
        "created_at": source.created_at.isoformat(),
    }
|
||||
|
||||
|
||||
@router.get("/sources/{source_id}")
async def get_source(source_id: str, db: Session = Depends(get_db)):
    """Return the full record for a single source, or 404 if unknown."""
    source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
    if source is None:
        raise HTTPException(status_code=404, detail="Source not found")

    # Scalar policy fields, then the extra fields the audit snapshot omits.
    payload = _source_to_dict(source)
    payload["metadata"] = source.metadata_
    payload["created_at"] = source.created_at.isoformat() if source.created_at else None
    payload["updated_at"] = source.updated_at.isoformat() if source.updated_at else None
    return payload
|
||||
|
||||
|
||||
@router.put("/sources/{source_id}")
async def update_source(
    source_id: str,
    data: SourceUpdate,
    db: Session = Depends(get_db),
):
    """Apply a partial update to a source and record before/after in the audit trail.

    Only fields explicitly set in the request body are applied; raises 404
    when the source does not exist.
    """
    source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
    if source is None:
        raise HTTPException(status_code=404, detail="Source not found")

    before = _source_to_dict(source)
    changes = data.model_dump(exclude_unset=True)

    # The API field "metadata" maps to the DB column attribute "metadata_".
    if "metadata" in changes:
        changes["metadata_"] = changes.pop("metadata")

    for attr, value in changes.items():
        setattr(source, attr, value)

    _log_audit(db, "update", "source", source.id, old_values=before, new_values=changes)
    db.commit()
    db.refresh(source)

    return {"status": "updated", "id": str(source.id)}
|
||||
|
||||
|
||||
@router.delete("/sources/{source_id}")
async def delete_source(source_id: str, db: Session = Depends(get_db)):
    """Remove an allowed source together with its operations-matrix rows.

    The audit entry captures the source's last known values; audit write,
    cascade delete of operations, and the source delete are committed
    atomically. Raises 404 if the source does not exist.
    """
    source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
    if not source:
        raise HTTPException(status_code=404, detail="Source not found")

    # Snapshot before deletion so the audit trail retains the row's state.
    old_values = _source_to_dict(source)
    _log_audit(db, "delete", "source", source.id, old_values=old_values)

    # Also delete associated operations
    db.query(SourceOperationDB).filter(SourceOperationDB.source_id == source_id).delete()
    db.delete(source)
    db.commit()

    return {"status": "deleted", "id": source_id}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Operations Matrix
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/operations-matrix")
async def get_operations_matrix(db: Session = Depends(get_db)):
    """Return every source/operation matrix entry plus a total count."""
    rows = db.query(SourceOperationDB).all()

    entries = []
    for row in rows:
        entries.append({
            "id": str(row.id),
            "source_id": str(row.source_id),
            "operation": row.operation,
            "allowed": row.allowed,
            "conditions": row.conditions,
        })

    return {"operations": entries, "count": len(entries)}
|
||||
|
||||
|
||||
@router.put("/operations/{operation_id}")
async def update_operation(
    operation_id: str,
    data: OperationUpdate,
    db: Session = Depends(get_db),
):
    """Update one entry in the operations matrix.

    `allowed` is always overwritten; `conditions` only when provided.
    Raises 404 for an unknown operation id.
    """
    op = db.query(SourceOperationDB).filter(SourceOperationDB.id == operation_id).first()
    if not op:
        raise HTTPException(status_code=404, detail="Operation not found")

    op.allowed = data.allowed
    if data.conditions is not None:
        op.conditions = data.conditions

    # NOTE(review): only "allowed" is recorded in the audit entry — a
    # conditions change is not captured; confirm this is intentional.
    _log_audit(db, "update", "operation", op.id, new_values={"allowed": data.allowed})
    db.commit()

    return {"status": "updated", "id": str(op.id)}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PII Rules
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/pii-rules")
async def list_pii_rules(db: Session = Depends(get_db)):
    """List every PII rule, ordered by category then name."""
    all_rules = db.query(PIIRuleDB).order_by(PIIRuleDB.category, PIIRuleDB.name).all()

    serialized = []
    for rule in all_rules:
        serialized.append({
            "id": str(rule.id),
            "name": rule.name,
            "description": rule.description,
            "pattern": rule.pattern,
            "category": rule.category,
            "action": rule.action,
            "active": rule.active,
            "created_at": rule.created_at.isoformat() if rule.created_at else None,
        })

    return {"rules": serialized, "count": len(serialized)}
|
||||
|
||||
|
||||
@router.post("/pii-rules")
async def create_pii_rule(data: PIIRuleCreate, db: Session = Depends(get_db)):
    """Create a new PII rule and record it in the audit trail.

    Rule and audit entry are committed in one transaction.
    """
    rule = PIIRuleDB(
        name=data.name,
        description=data.description,
        pattern=data.pattern,
        category=data.category,
        action=data.action,
        active=data.active,
    )
    db.add(rule)
    # Fix: flush so the uuid4 column default fires and rule.id is set
    # before the audit entry is written (it was previously logged with
    # entity_id=None, since the default only applies at flush time).
    db.flush()
    _log_audit(db, "create", "pii_rule", rule.id, new_values={"name": data.name, "category": data.category})
    db.commit()
    db.refresh(rule)

    return {"id": str(rule.id), "name": rule.name}
|
||||
|
||||
|
||||
@router.put("/pii-rules/{rule_id}")
async def update_pii_rule(rule_id: str, data: PIIRuleUpdate, db: Session = Depends(get_db)):
    """Partially update a PII rule (only fields the client set are applied)."""
    rule = db.query(PIIRuleDB).filter(PIIRuleDB.id == rule_id).first()
    if not rule:
        raise HTTPException(status_code=404, detail="PII rule not found")

    update_data = data.model_dump(exclude_unset=True)
    for key, value in update_data.items():
        setattr(rule, key, value)

    # NOTE(review): unlike update_source, no old_values snapshot is logged
    # here — confirm the audit trail does not need the previous state.
    _log_audit(db, "update", "pii_rule", rule.id, new_values=update_data)
    db.commit()

    return {"status": "updated", "id": str(rule.id)}
|
||||
|
||||
|
||||
@router.delete("/pii-rules/{rule_id}")
async def delete_pii_rule(rule_id: str, db: Session = Depends(get_db)):
    """Delete a PII rule, keeping a name/category snapshot in the audit trail.

    Raises 404 if the rule does not exist.
    """
    rule = db.query(PIIRuleDB).filter(PIIRuleDB.id == rule_id).first()
    if not rule:
        raise HTTPException(status_code=404, detail="PII rule not found")

    # Audit before delete so the rule's identifying values are still readable.
    _log_audit(db, "delete", "pii_rule", rule.id, old_values={"name": rule.name, "category": rule.category})
    db.delete(rule)
    db.commit()

    return {"status": "deleted", "id": rule_id}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Audit Trail
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/policy-audit")
async def get_policy_audit(
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    entity_type: Optional[str] = None,
    db: Session = Depends(get_db),
):
    """Return one page of the source-policy audit trail, newest first.

    Optionally filtered by entity_type; the response carries the filtered
    total so clients can paginate.
    """
    base_query = db.query(SourcePolicyAuditDB)
    if entity_type:
        base_query = base_query.filter(SourcePolicyAuditDB.entity_type == entity_type)

    total = base_query.count()
    page = (
        base_query.order_by(SourcePolicyAuditDB.created_at.desc())
        .offset(offset)
        .limit(limit)
        .all()
    )

    items = []
    for entry in page:
        items.append({
            "id": str(entry.id),
            "action": entry.action,
            "entity_type": entry.entity_type,
            "entity_id": str(entry.entity_id) if entry.entity_id else None,
            "old_values": entry.old_values,
            "new_values": entry.new_values,
            "user_id": entry.user_id,
            "created_at": entry.created_at.isoformat() if entry.created_at else None,
        })

    return {
        "entries": items,
        "total": total,
        "limit": limit,
        "offset": offset,
    }
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Dashboard Statistics
|
||||
# =============================================================================
|
||||
|
||||
@router.get("/policy-stats")
async def get_policy_stats(db: Session = Depends(get_db)):
    """Aggregate counters for the source-policy dashboard."""
    source_total = db.query(AllowedSourceDB).count()
    source_active = db.query(AllowedSourceDB).filter(AllowedSourceDB.active == True).count()
    rule_active = db.query(PIIRuleDB).filter(PIIRuleDB.active == True).count()

    # "Blocked" is approximated by delete actions in the audit trail;
    # "today" starts at UTC midnight.
    midnight_utc = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    deletes_today = db.query(SourcePolicyAuditDB).filter(
        SourcePolicyAuditDB.action == "delete",
        SourcePolicyAuditDB.created_at >= midnight_utc,
    ).count()
    deletes_total = db.query(SourcePolicyAuditDB).filter(
        SourcePolicyAuditDB.action == "delete",
    ).count()

    return {
        "active_policies": source_active,
        "allowed_sources": source_total,
        "pii_rules": rule_active,
        "blocked_today": deletes_today,
        "blocked_total": deletes_total,
    }
|
||||
|
||||
|
||||
@router.get("/compliance-report")
async def get_compliance_report(db: Session = Depends(get_db)):
    """Build an ad-hoc compliance report over active sources and PII rules."""
    active_sources = db.query(AllowedSourceDB).filter(AllowedSourceDB.active == True).all()
    active_rules = db.query(PIIRuleDB).filter(PIIRuleDB.active == True).all()

    source_entries = []
    for src in active_sources:
        source_entries.append({
            "domain": src.domain,
            "name": src.name,
            "license": src.license,
            "legal_basis": src.legal_basis,
            "trust_boost": src.trust_boost,
        })

    rule_entries = []
    for rule in active_rules:
        rule_entries.append({
            "name": rule.name,
            "category": rule.category,
            "action": rule.action,
        })

    return {
        "report_date": datetime.utcnow().isoformat(),
        "summary": {
            "active_sources": len(active_sources),
            "active_pii_rules": len(active_rules),
            # Deduplicate via set; ordering is unspecified, as before.
            "source_types": list({s.source_type for s in active_sources}),
            "licenses": list({s.license for s in active_sources if s.license}),
        },
        "sources": source_entries,
        "pii_rules": rule_entries,
    }
|
||||
105
backend-compliance/compliance/db/source_policy_models.py
Normal file
105
backend-compliance/compliance/db/source_policy_models.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
SQLAlchemy models for Source Policy Management.
|
||||
|
||||
Tables:
|
||||
- compliance_allowed_sources: Whitelisted data sources for RAG corpus
|
||||
- compliance_source_operations: Operations matrix for source data flows
|
||||
- compliance_pii_rules: PII detection/masking rules for sources
|
||||
- compliance_source_policy_audit: Audit trail for source policy changes
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import (
|
||||
Column, String, Text, Boolean, DateTime, Float, JSON, Index
|
||||
)
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
|
||||
from classroom_engine.database import Base
|
||||
|
||||
|
||||
class AllowedSourceDB(Base):
    """Whitelisted data source for compliance RAG corpus.

    One row per domain; the API layer (source_policy_router) enforces
    domain uniqueness before insert and serializes these rows directly.
    """

    __tablename__ = 'compliance_allowed_sources'

    # Client-side uuid4 default — populated at flush, not at construction.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    domain = Column(String(255), unique=True, nullable=False)
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    license = Column(String(100), nullable=True)  # DL-DE-BY-2.0, CC-BY, etc.
    legal_basis = Column(String(200), nullable=True)  # §5 UrhG, etc.
    trust_boost = Column(Float, default=0.5)
    source_type = Column(String(50), default='legal')  # legal, guidance, template
    active = Column(Boolean, default=True)
    # Attribute is metadata_ because Declarative reserves .metadata;
    # the underlying column is still named 'metadata'.
    metadata_ = Column('metadata', JSON, nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        # NOTE(review): unique=True on domain already creates a unique index
        # in PostgreSQL, so idx_allowed_sources_domain is redundant.
        Index('idx_allowed_sources_domain', 'domain'),
        Index('idx_allowed_sources_active', 'active'),
    )

    def __repr__(self):
        return f"<AllowedSource {self.domain}: {self.name}>"
|
||||
|
||||
|
||||
class SourceOperationDB(Base):
    """Operations matrix entry for source data flows.

    Links an operation (ingest/search/export/share) to an allowed source.
    """

    __tablename__ = 'compliance_source_operations'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # NOTE(review): no ForeignKey here, although migration 001 declares
    # REFERENCES compliance_allowed_sources(id) ON DELETE CASCADE — the ORM
    # will not cascade; delete_source deletes these rows manually. Consider
    # aligning the model with the DDL.
    source_id = Column(UUID(as_uuid=True), nullable=False)
    operation = Column(String(50), nullable=False)  # ingest, search, export, share
    allowed = Column(Boolean, default=True)
    conditions = Column(Text, nullable=True)  # Conditions for this operation
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_source_operations_source', 'source_id'),
    )
|
||||
|
||||
|
||||
class PIIRuleDB(Base):
    """PII detection and masking rule for compliance sources.

    A rule pairs an optional regex pattern with an action the pipeline
    should take when the pattern matches.
    """

    __tablename__ = 'compliance_pii_rules'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    pattern = Column(Text, nullable=True)  # Regex pattern
    category = Column(String(50), nullable=False)  # email, phone, name, address, etc.
    action = Column(String(20), default='mask')  # mask, redact, flag
    active = Column(Boolean, default=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_pii_rules_category', 'category'),
        Index('idx_pii_rules_active', 'active'),
    )
|
||||
|
||||
|
||||
class SourcePolicyAuditDB(Base):
    """Audit trail for source policy changes.

    Append-only: the router writes one row per create/update/delete with
    JSON before/after snapshots; entity_id may be NULL (see the flush-order
    handling in the create endpoints).
    """

    __tablename__ = 'compliance_source_policy_audit'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    action = Column(String(20), nullable=False)  # create, update, delete
    entity_type = Column(String(50), nullable=False)  # source, operation, pii_rule
    entity_id = Column(UUID(as_uuid=True), nullable=True)
    old_values = Column(JSON, nullable=True)
    new_values = Column(JSON, nullable=True)
    user_id = Column(String(100), nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    __table_args__ = (
        Index('idx_source_audit_entity', 'entity_type', 'entity_id'),
        Index('idx_source_audit_created', 'created_at'),
    )
|
||||
@@ -21,6 +21,9 @@ from dsr_admin_api import router as dsr_admin_router, templates_router as dsr_te
|
||||
# Compliance framework sub-package
|
||||
from compliance.api import router as compliance_framework_router
|
||||
|
||||
# Source Policy
|
||||
from compliance.api.source_policy_router import router as source_policy_router
|
||||
|
||||
# Middleware
|
||||
from middleware import (
|
||||
RequestIDMiddleware,
|
||||
@@ -85,6 +88,9 @@ app.include_router(dsr_templates_router, prefix="/api")
|
||||
# Compliance Framework (regulations, controls, evidence, risks, audits, ISMS)
|
||||
app.include_router(compliance_framework_router, prefix="/api")
|
||||
|
||||
# Source Policy (allowed sources, PII rules, audit)
|
||||
app.include_router(source_policy_router, prefix="/api")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
73
backend-compliance/migrations/001_source_policy.sql
Normal file
73
backend-compliance/migrations/001_source_policy.sql
Normal file
@@ -0,0 +1,73 @@
|
||||
-- =============================================================================
-- Migration 001: Source Policy Tables
--
-- Tables for managing allowed compliance data sources, operations matrix,
-- PII rules, and audit trail. All statements are idempotent so the
-- migration can be re-run safely.
-- =============================================================================

CREATE TABLE IF NOT EXISTS compliance_allowed_sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    domain VARCHAR(255) UNIQUE NOT NULL,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    license VARCHAR(100),
    legal_basis VARCHAR(200),
    trust_boost FLOAT DEFAULT 0.5,
    source_type VARCHAR(50) DEFAULT 'legal',
    active BOOLEAN DEFAULT true,
    metadata JSON,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);

-- NOTE(review): the UNIQUE constraint on domain already creates a unique
-- index, so idx_allowed_sources_domain is redundant in PostgreSQL.
CREATE INDEX IF NOT EXISTS idx_allowed_sources_domain ON compliance_allowed_sources(domain);
CREATE INDEX IF NOT EXISTS idx_allowed_sources_active ON compliance_allowed_sources(active);

CREATE TABLE IF NOT EXISTS compliance_source_operations (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id UUID NOT NULL REFERENCES compliance_allowed_sources(id) ON DELETE CASCADE,
    operation VARCHAR(50) NOT NULL,
    allowed BOOLEAN DEFAULT true,
    conditions TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);

CREATE INDEX IF NOT EXISTS idx_source_operations_source ON compliance_source_operations(source_id);

CREATE TABLE IF NOT EXISTS compliance_pii_rules (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description TEXT,
    pattern TEXT,
    category VARCHAR(50) NOT NULL,
    action VARCHAR(20) DEFAULT 'mask',
    active BOOLEAN DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);

CREATE INDEX IF NOT EXISTS idx_pii_rules_category ON compliance_pii_rules(category);
CREATE INDEX IF NOT EXISTS idx_pii_rules_active ON compliance_pii_rules(active);

CREATE TABLE IF NOT EXISTS compliance_source_policy_audit (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    action VARCHAR(20) NOT NULL,
    entity_type VARCHAR(50) NOT NULL,
    entity_id UUID,
    old_values JSON,
    new_values JSON,
    user_id VARCHAR(100),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_source_audit_entity ON compliance_source_policy_audit(entity_type, entity_id);
CREATE INDEX IF NOT EXISTS idx_source_audit_created ON compliance_source_policy_audit(created_at);

-- Seed default PII rules.
-- Fix: compliance_pii_rules has no unique constraint for ON CONFLICT to
-- target, so the previous "ON CONFLICT DO NOTHING" never fired and every
-- re-run of this migration inserted duplicate seed rows. Guard the insert
-- with WHERE NOT EXISTS so seeding only happens on an empty table.
INSERT INTO compliance_pii_rules (name, category, pattern, action, description)
SELECT v.name, v.category, v.pattern, v.action, v.description
FROM (VALUES
    ('E-Mail-Adresse', 'email', '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', 'mask', 'E-Mail-Adressen erkennen und maskieren'),
    ('Telefonnummer', 'phone', '(\+49|0)[0-9\s/-]{8,15}', 'mask', 'Deutsche Telefonnummern erkennen'),
    ('IBAN', 'financial', 'DE[0-9]{2}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{2}', 'redact', 'Deutsche IBAN-Nummern erkennen und entfernen'),
    ('Postadresse', 'address', '[0-9]{5}\s+[A-Z][a-z]', 'flag', 'Postleitzahlen mit Ortsnamen markieren')
) AS v(name, category, pattern, action, description)
WHERE NOT EXISTS (SELECT 1 FROM compliance_pii_rules);
||||
971
scripts/ingest-legal-corpus.sh
Executable file
971
scripts/ingest-legal-corpus.sh
Executable file
@@ -0,0 +1,971 @@
|
||||
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — RAG Legal Corpus Ingestion
#
# Downloads 23 freely licensed legal sources and ingests them into Qdrant
# via the Core RAG API (port 8097).
#
# Run on the Mac Mini:
#   ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
#
# Phases: download, gesetze, eu, templates, datenschutz, verify
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
# All endpoints are overridable via the environment where a default with
# ${VAR:-...} is used; RAG_URL/QDRANT_URL are fixed local endpoints.
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
QDRANT_URL="http://localhost:6333"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"
# -k tolerates the local self-signed certs; 300s cap for large uploads.
# Intentionally a plain string so call sites can word-split it (unquoted).
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"

# Counters
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""

while [[ $# -gt 0 ]]; do
    case $1 in
        --skip-download) SKIP_DOWNLOAD=true; shift ;;
        --only) ONLY_PHASE="$2"; shift 2 ;;
        -h|--help)
            echo "Usage: $0 [--skip-download] [--only PHASE]"
            echo "Phases: download, gesetze, eu, templates, datenschutz, verify, version"
            exit 0
            ;;
        *) echo "Unknown option: $1"; exit 1 ;;
    esac
done

# --- Helpers -----------------------------------------------------------------
# Timestamped log helpers; warn/fail write to stderr.
log() { echo "[$(date '+%H:%M:%S')] $*"; }
ok() { echo "[$(date '+%H:%M:%S')] ✓ $*"; }
warn() { echo "[$(date '+%H:%M:%S')] ⚠ $*" >&2; }
fail() { echo "[$(date '+%H:%M:%S')] ✗ $*" >&2; }
|
||||
|
||||
upload_file() {
    # Upload one file to the RAG API and update the global counters.
    # Args: $1 file path, $2 collection, $3 data_type, $4 use_case,
    #       $5 year, $6 metadata JSON string,
    #       $7 optional display label (defaults to the file's basename).
    # Returns 1 on missing/too-small file or failed upload.
    local file="$1"
    local collection="$2"
    local data_type="$3"
    local use_case="$4"
    local year="$5"
    local metadata_json="$6"
    local label="${7:-$(basename "$file")}"

    if [[ ! -f "$file" ]]; then
        warn "File not found: $file"
        FAILED=$((FAILED + 1))
        return 1
    fi

    # stat -f%z is BSD/macOS, stat -c%s is GNU; fall back to 0 on failure.
    local filesize
    filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
    if [[ "$filesize" -lt 100 ]]; then
        warn "File too small (${filesize}B), skipping: $label"
        SKIPPED=$((SKIPPED + 1))
        return 1
    fi

    log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

    # CURL_OPTS stays unquoted so its flags word-split; '|| true' keeps
    # set -e from aborting on curl failure (handled via the response body).
    local response
    response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
        -F "file=@${file}" \
        -F "collection=${collection}" \
        -F "data_type=${data_type}" \
        -F "use_case=${use_case}" \
        -F "year=${year}" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=512" \
        -F "chunk_overlap=50" \
        -F "metadata_json=${metadata_json}" \
        2>/dev/null) || true

    # Success is signalled by either "chunks_count" or "vectors_indexed"
    # in the JSON response; anything else counts as failure.
    if echo "$response" | grep -q '"chunks_count"'; then
        local chunks
        chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
        ok "$label → $chunks chunks"
        UPLOADED=$((UPLOADED + 1))
    elif echo "$response" | grep -q '"vectors_indexed"'; then
        local vectors
        vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
        ok "$label → $vectors vectors"
        UPLOADED=$((UPLOADED + 1))
    else
        fail "Upload failed: $label"
        fail "Response: $response"
        FAILED=$((FAILED + 1))
        return 1
    fi
}
|
||||
|
||||
clone_repo() {
    # Shallow-clone a git repository unless the target already exists.
    # Args: $1 repo URL, $2 destination directory. Returns 1 on clone failure.
    local repo_url="$1"
    local dest_dir="$2"

    if [[ -d "$dest_dir" ]]; then
        log "Repo exists: $dest_dir (skipping clone)"
        return 0
    fi

    log "Cloning: $repo_url"
    if ! git clone --depth 1 "$repo_url" "$dest_dir" 2>/dev/null; then
        warn "Clone failed: $repo_url"
        return 1
    fi
}
|
||||
|
||||
download_pdf() {
    # Fetch a PDF with curl unless it was already downloaded.
    # Args: $1 source URL, $2 target path. Returns 1 on download failure.
    local src_url="$1"
    local dest_file="$2"

    if [[ -f "$dest_file" ]]; then
        log "PDF exists: $(basename "$dest_file") (skipping)"
        return 0
    fi

    log "Downloading: $(basename "$dest_file")"
    if ! curl $CURL_OPTS -L "$src_url" -o "$dest_file" 2>/dev/null; then
        warn "Download failed: $src_url"
        return 1
    fi
}
|
||||
|
||||
# Extract text from gesetze-im-internet.de HTML page.
# Fetches the page and strips tags with a minimal stdlib HTMLParser; the
# site serves ISO-8859-1, which the embedded Python re-decodes explicitly.
# NOTE(review): the parser sets in_content for div.jnhtml but never reads
# it — looks like a dead flag; confirm whether content scoping was intended.
extract_gesetz_html() {
    # Args: $1 page URL, $2 output text file, $3 human-readable label.
    local url="$1"
    local output="$2"
    local label="$3"

    # Idempotent: skip if the text was already extracted.
    if [[ -f "$output" ]]; then
        log "Text exists: $(basename "$output") (skipping)"
        return 0
    fi

    log "Extracting: $label from gesetze-im-internet.de"
    curl $CURL_OPTS -L "$url" 2>/dev/null \
        | python3 -c "
import sys, codecs

# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        self.in_content = False
        self.skip = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == 'div' and 'jnhtml' in attrs_dict.get('class', ''):
            self.in_content = True
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = True

    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = False
        if tag in ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li'):
            self.text.append('\n')

    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)

parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$output" || {
        warn "Extraction failed: $label"
        return 1
    }
}
|
||||
|
||||
# Concatenate Markdown files from bundestag/gesetze repo for a specific law
# into one text file with a title heading and '---' separators per section.
concat_bundestag_gesetz() {
    # Args: $1 law directory in the repo, $2 output file, $3 title label.
    local gesetz_dir="$1"
    local output="$2"
    local label="$3"

    # Missing directory is non-fatal: warn and return success so the
    # overall pipeline keeps going.
    if [[ ! -d "$gesetz_dir" ]]; then
        warn "Gesetz directory not found: $gesetz_dir"
        return 0
    fi

    log "Concatenating: $label"
    {
        echo "# $label"
        echo ""
        # NOTE(review): 'sort' is lexicographic, not numeric — for file
        # names carrying bare paragraph numbers, '10' sorts before '9';
        # confirm the repo's naming keeps this ordering acceptable.
        find "$gesetz_dir" -name "*.md" -type f | sort | while read -r f; do
            cat "$f"
            echo ""
            echo "---"
            echo ""
        done
    } > "$output"
}
|
||||
|
||||
collection_count() {
    # Print the points_count of a Qdrant collection, or "?" when the
    # request or JSON parsing fails.
    local collection="$1"
    curl -s "${QDRANT_URL}/collections/${collection}" 2>/dev/null \
        | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null || echo "?"
}
|
||||
|
||||
# =============================================================================
# PHASE A: Downloads
# =============================================================================
phase_download() {
    # Fetch all raw material: EUR-Lex PDFs, individual German statute
    # sections, upstream git repos, EDPB/EDPS guidance PDFs, then derive
    # plain-text files from the cloned repos. Each helper is idempotent,
    # so re-running this phase only fetches what is missing.
    log "=========================================="
    log "PHASE A: Downloads (PDFs + Git-Repos)"
    log "=========================================="

    mkdir -p "$WORK_DIR"/{pdfs,repos,texts}

    # --- A1: EUR-Lex PDFs ---
    log "--- EUR-Lex PDFs ---"

    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065" \
        "$WORK_DIR/pdfs/dsa_2022_2065.pdf"

    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058" \
        "$WORK_DIR/pdfs/eprivacy_2002_58.pdf"

    download_pdf \
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914" \
        "$WORK_DIR/pdfs/scc_2021_914.pdf"

    # --- A2: German statutes (individual sections) ---
    log "--- Deutsche Gesetze (Einzelparagraphen) ---"

    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/ddg/__5.html" \
        "$WORK_DIR/texts/ddg_5.txt" \
        "DDG § 5 (Impressum)"

    # The TDDDG is still published under its old name "ttdsg" on
    # gesetze-im-internet.de.
    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/ttdsg/__25.html" \
        "$WORK_DIR/texts/tdddg_25.txt" \
        "TDDDG § 25 (Cookies)"

    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/urhg/__5.html" \
        "$WORK_DIR/texts/urhg_5.txt" \
        "UrhG § 5 (Amtliche Werke)"

    # EGBGB Art. 246a § 1 (contains the reference to the model withdrawal notice)
    extract_gesetz_html \
        "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html" \
        "$WORK_DIR/texts/egbgb_widerruf.txt" \
        "EGBGB Muster-Widerrufsbelehrung"

    # --- A3: Git repos ---
    log "--- Git-Repos ---"

    clone_repo "https://github.com/bundestag/gesetze.git" \
        "$WORK_DIR/repos/bundestag-gesetze"

    clone_repo "https://github.com/github/site-policy.git" \
        "$WORK_DIR/repos/github-site-policy"

    clone_repo "https://github.com/opengovfoundation/site-policy.git" \
        "$WORK_DIR/repos/opengov-site-policy"

    clone_repo "https://github.com/creativecommons/cc-legal-tools-data.git" \
        "$WORK_DIR/repos/cc-legal-tools"

    clone_repo "https://github.com/oprvc/oprvc.github.io.git" \
        "$WORK_DIR/repos/oprvc"

    clone_repo "https://github.com/webflorist/privacy-policy-text.git" \
        "$WORK_DIR/repos/webflorist"

    # '|| true' keeps set -e from aborting on optional repos.
    clone_repo "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git" \
        "$WORK_DIR/repos/tempest-privacy" || true

    clone_repo "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git" \
        "$WORK_DIR/repos/tempest-tos" || true

    clone_repo "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git" \
        "$WORK_DIR/repos/tempest-cookie" || true

    clone_repo "https://github.com/orestbida/cookieconsent.git" \
        "$WORK_DIR/repos/cookieconsent" || true

    # CommonPaper keeps a separate repo per contract type
    clone_repo "https://github.com/CommonPaper/CSA.git" \
        "$WORK_DIR/repos/common-paper-csa" || true
    clone_repo "https://github.com/CommonPaper/SLA.git" \
        "$WORK_DIR/repos/common-paper-sla" || true
    clone_repo "https://github.com/CommonPaper/PSA.git" \
        "$WORK_DIR/repos/common-paper-psa" || true

    # OpenCode.de (model data-use clauses) - try HTTPS
    clone_repo "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git" \
        "$WORK_DIR/repos/datennutzungsklauseln" || true

    # --- A4: EDPB/EDPS PDFs (verified URLs) ---
    log "--- EDPB/EDPS Guidance PDFs ---"

    # EDPB Guidelines 05/2020 on Consent
    download_pdf \
        "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf" \
        "$WORK_DIR/pdfs/edpb_consent_guidelines.pdf"

    # EDPB Guidelines 4/2019 Data Protection by Design and Default
    download_pdf \
        "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf" \
        "$WORK_DIR/pdfs/edpb_privacy_by_design.pdf"

    # EDPB Guidelines 03/2022 Dark Patterns
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf" \
        "$WORK_DIR/pdfs/edpb_dark_patterns.pdf"

    # EDPB Guidelines 8/2020 Social Media Targeting
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf" \
        "$WORK_DIR/pdfs/edpb_social_media_targeting.pdf"

    # EDPB Cookie Banner Taskforce Report (Jan 2023)
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf" \
        "$WORK_DIR/pdfs/edpb_cookie_banner_taskforce.pdf"

    # EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf" \
        "$WORK_DIR/pdfs/edpb_eprivacy_art53.pdf"

    # EDPB Guidelines 1/2024 Legitimate Interest
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf" \
        "$WORK_DIR/pdfs/edpb_legitimate_interest.pdf"

    # EDPB DPO Coordinated Enforcement Report 2024
    download_pdf \
        "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf" \
        "$WORK_DIR/pdfs/edpb_dpo_report.pdf"

    # EDPS GenAI Orientations (June 2024)
    download_pdf \
        "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf" \
        "$WORK_DIR/pdfs/edps_generative_ai.pdf"

    # EDPS Digital Ethics Report (2018)
    download_pdf \
        "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf" \
        "$WORK_DIR/pdfs/edps_digital_ethics.pdf"

    # --- A5: Text extraction from repos ---
    log "--- Text-Extraktion aus Repos ---"

    # bundestag/gesetze: available statutes (the repo is partially outdated).
    # DDG, TDDDG and EGBGB are missing from the repo — only BGB, UrhG and
    # TMG are present.
    local -a bundestag_gesetze=(
        "b/bgb:BGB"
        "u/urhg:UrhG"
        "t/tmg:TMG"
    )
    for entry in "${bundestag_gesetze[@]}"; do
        # Entries are "repo/path:LABEL"; split on the colon.
        local path="${entry%%:*}"
        local label="${entry##*:}"
        local gesetz_dir="$WORK_DIR/repos/bundestag-gesetze/$path"
        if [[ -d "$gesetz_dir" ]]; then
            local name
            name=$(echo "$label" | tr '[:upper:]' '[:lower:]')
            concat_bundestag_gesetz "$gesetz_dir" \
                "$WORK_DIR/texts/bundestag_${name}_komplett.txt" \
                "$label (komplett)"
        else
            warn "Bundestag Gesetz nicht gefunden: $gesetz_dir"
        fi
    done

    log "Download phase complete."
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE B: Deutsche Gesetze → bp_compliance_gesetze
|
||||
# =============================================================================
|
||||
# Phase B: ingest German statutes into the bp_compliance_gesetze collection.
# Uploads the individual key paragraphs first, then the full-text laws
# mirrored from the bundestag/gesetze repository.
phase_gesetze() {
    log "=========================================="
    log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
    log "=========================================="

    local collection="bp_compliance_gesetze"
    local chunks_before
    chunks_before=$(collection_count "$collection")
    log "Collection $collection: $chunks_before chunks (before)"

    # B1: Einzelparagraphen
    upload_file "$WORK_DIR/texts/ddg_5.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}' \
        "DDG § 5 (Impressumspflicht)"

    upload_file "$WORK_DIR/texts/tdddg_25.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}' \
        "TDDDG § 25 (Cookies/Endgeraetezugriff)"

    upload_file "$WORK_DIR/texts/urhg_5.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}' \
        "UrhG § 5 (Amtliche Werke)"

    upload_file "$WORK_DIR/texts/egbgb_widerruf.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}' \
        "EGBGB Muster-Widerrufsbelehrung"

    # B2: Bundestag/gesetze (komplett) — table entries are "key:LABEL:Full Name".
    local -a bundestag_upload=(
        "bgb:BGB:Buergerliches Gesetzbuch"
        "urhg:UrhG:Urheberrechtsgesetz"
        "tmg:TMG:Telemediengesetz"
    )
    local entry key label fullname file
    for entry in "${bundestag_upload[@]}"; do
        IFS=':' read -r key label fullname <<< "$entry"
        file="$WORK_DIR/texts/bundestag_${key}_komplett.txt"
        # Skip laws whose concatenated full text was not produced in phase A.
        [[ -f "$file" ]] || continue
        upload_file "$file" "$collection" "compliance" "legal_reference" "2024" \
            "{\"regulation_id\":\"${key}_komplett\",\"regulation_name_de\":\"$fullname ($label komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
            "$label komplett (Bundestag-Repo)"
    done

    local chunks_after
    chunks_after=$(collection_count "$collection")
    log "Collection $collection: $chunks_before → $chunks_after chunks"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE C: EU-Rechtstexte → bp_compliance_ce
|
||||
# =============================================================================
|
||||
# Phase C: ingest EU legal acts (DSA, ePrivacy, SCC) into bp_compliance_ce.
phase_eu() {
    log "=========================================="
    log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
    log "=========================================="

    local collection="bp_compliance_ce"
    local chunks_before
    chunks_before=$(collection_count "$collection")
    log "Collection $collection: $chunks_before chunks (before)"

    # Upload table, one entry per act: "pdf file|year|metadata JSON|display label".
    # '|' is safe as a delimiter: it occurs in none of the fields.
    local -a eu_docs=(
        'dsa_2022_2065.pdf|2022|{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}|Digital Services Act (EU) 2022/2065'
        'eprivacy_2002_58.pdf|2002|{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}|ePrivacy-Richtlinie 2002/58/EC'
        'scc_2021_914.pdf|2021|{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}|Standardvertragsklauseln (EU) 2021/914'
    )
    local spec pdf_name year meta doc_label
    for spec in "${eu_docs[@]}"; do
        IFS='|' read -r pdf_name year meta doc_label <<< "$spec"
        upload_file "$WORK_DIR/pdfs/$pdf_name" "$collection" "compliance_ce" "legal_reference" "$year" \
            "$meta" \
            "$doc_label"
    done

    local chunks_after
    chunks_after=$(collection_count "$collection")
    log "Collection $collection: $chunks_before → $chunks_after chunks"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE D: Templates/Textbausteine → bp_legal_templates
|
||||
# =============================================================================
|
||||
# Phase D: ingest legal template/boilerplate texts (site policies, license
# texts, privacy/ToS/cookie generators, contract standards) into
# bp_legal_templates. Each D-section handles one upstream repository.
#
# FIX: all per-file loops are now fed via process substitution
# ("done < <(find ...)") instead of "find ... | while read ...". A pipeline
# runs the while-body in a subshell, so any global counters upload_file
# maintains (UPLOADED/FAILED/SKIPPED, read by the final summary and by
# phase_register_version) would silently be lost when the subshell exits.
phase_templates() {
    log "=========================================="
    log "PHASE D: Templates → bp_legal_templates"
    log "=========================================="

    local col="bp_legal_templates"
    local before
    before=$(collection_count "$col")
    log "Collection $col: $before chunks (before)"

    # --- D1: GitHub Site Policy (CC0) ---
    local repo="$WORK_DIR/repos/github-site-policy"
    if [[ -d "$repo" ]]; then
        log "--- GitHub Site Policy ---"
        while read -r f; do
            local basename
            basename=$(basename "$f" .md)
            # Classify the document type from its filename.
            local doc_type="policy"
            case "$basename" in
                *terms*|*tos*|*service*) doc_type="tos" ;;
                *privacy*|*data*) doc_type="privacy_policy" ;;
                *dmca*|*copyright*) doc_type="dmca" ;;
                *acceptable*|*use*) doc_type="acceptable_use" ;;
            esac
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$basename\"}" \
                "GitHub: $basename"
        done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
    fi

    # --- D2: OpenGov Site Policy (CC0) ---
    repo="$WORK_DIR/repos/opengov-site-policy"
    if [[ -d "$repo" ]]; then
        log "--- OpenGov Site Policy ---"
        while read -r f; do
            local basename
            basename=$(basename "$f" .md)
            local doc_type="policy"
            case "$basename" in
                *terms*|*tos*) doc_type="tos" ;;
                *privacy*) doc_type="privacy_policy" ;;
            esac
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$basename\"}" \
                "OpenGov: $basename"
        done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
    fi

    # --- D3: Creative Commons Legal Tools (CC0) ---
    repo="$WORK_DIR/repos/cc-legal-tools"
    if [[ -d "$repo" ]]; then
        log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
        # Only ingest the main license deeds (DE legalcode where available, else EN)
        for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
            if [[ -d "$license_dir" ]]; then
                while read -r f; do
                    local basename
                    basename=$(basename "$f")
                    upload_file "$f" "$col" "legal_template" "template" "2024" \
                        "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
                        "CC License: $basename"
                done < <(find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3)
            fi
        done
        # Fallback: try top-level legalcode files
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
                "CC License: $basename"
        done < <(find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5)
    fi

    # --- D4: opr.vc DSGVO-Mustertexte (CC0) ---
    repo="$WORK_DIR/repos/oprvc"
    if [[ -d "$repo" ]]; then
        log "--- opr.vc DSGVO-Mustertexte ---"
        # Look for German privacy/DSGVO content
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
                "opr.vc: $basename"
        done < <(find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
            -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
            | grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
            | head -20)
        # If no specific files found, try all markdown files
        if [[ $(find "$repo" \( -name "*.md" -o -name "*.html" \) -not -path "*/.git/*" -not -name "README.md" | grep -ciE "(datenschutz|privacy|dsgvo|gdpr)" 2>/dev/null) -eq 0 ]]; then
            while read -r f; do
                local basename
                basename=$(basename "$f")
                upload_file "$f" "$col" "legal_template" "template" "2024" \
                    "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
                    "opr.vc: $basename"
            done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10)
        fi
    fi

    # --- D5: webflorist/privacy-policy-text (MIT) ---
    repo="$WORK_DIR/repos/webflorist"
    if [[ -d "$repo" ]]; then
        log "--- webflorist Privacy Policy Text ---"
        # Look for JSON/text building blocks (German)
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"webflorist\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/webflorist/privacy-policy-text\",\"filename\":\"$basename\"}" \
                "webflorist: $basename"
        done < <(find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
            -not -name "composer.json" -not -name "README.md" 2>/dev/null \
            | head -20)
    fi

    # --- D6: Tempest Privacy Policy Generator (MIT) ---
    repo="$WORK_DIR/repos/tempest-privacy"
    if [[ -d "$repo" ]]; then
        log "--- Tempest Privacy Policy Generator ---"
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"tempest_privacy\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/privacy-policy-generator\",\"filename\":\"$basename\"}" \
                "Tempest Privacy: $basename"
        done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" \
            -not -name "package*.json" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    # --- D7: Tempest Terms of Service Generator (MIT) ---
    repo="$WORK_DIR/repos/tempest-tos"
    if [[ -d "$repo" ]]; then
        log "--- Tempest Terms of Service Generator ---"
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"tempest_tos\",\"doc_type\":\"tos\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/terms-of-service-generator\",\"filename\":\"$basename\"}" \
                "Tempest ToS: $basename"
        done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" \
            -not -name "package*.json" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    # --- D8: Tempest Cookie Banner (MIT) ---
    repo="$WORK_DIR/repos/tempest-cookie"
    if [[ -d "$repo" ]]; then
        log "--- Tempest Cookie Banner ---"
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"tempest_cookie\",\"doc_type\":\"cookie_banner\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/cookie-banner-consent-solution\",\"filename\":\"$basename\"}" \
                "Tempest Cookie: $basename"
        done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" \
            -not -name "package*.json" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    # --- D9: CookieConsent (orestbida) - UI Strings (MIT) ---
    repo="$WORK_DIR/repos/cookieconsent"
    if [[ -d "$repo" ]]; then
        log "--- CookieConsent UI Strings ---"
        # Look for translation/language files
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
                "CookieConsent: $basename"
        done < <(find "$repo" -path "*/translations/*" -o -path "*/languages/*" -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
            | grep -iE "\.(json|js|ts)$" | head -10)
        # Also check for example configs
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
                "CookieConsent Docs: $basename"
        done < <(find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5)
    fi

    # --- D10: Common Paper (CC BY 4.0) ---
    log "--- Common Paper Standards ---"
    # Entries are "repo dir:doc_type:LABEL".
    local -a cp_repos=(
        "common-paper-csa:saas_contract:CSA"
        "common-paper-sla:sla:SLA"
        "common-paper-psa:psa:PSA"
    )
    for entry in "${cp_repos[@]}"; do
        local cp_dir="${entry%%:*}"
        local rest="${entry#*:}"
        local cp_doc_type="${rest%%:*}"
        local cp_label="${rest#*:}"
        repo="$WORK_DIR/repos/$cp_dir"
        if [[ -d "$repo" ]]; then
            while read -r f; do
                local basename
                basename=$(basename "$f" .md)
                upload_file "$f" "$col" "legal_template" "template" "2024" \
                    "{\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\",\"filename\":\"$basename\"}" \
                    "CommonPaper $cp_label: $basename"
            done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
                -not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
                | head -10)
        fi
    done

    # --- D11: Datennutzungsklauseln (CC BY 4.0) ---
    repo="$WORK_DIR/repos/datennutzungsklauseln"
    if [[ -d "$repo" ]]; then
        log "--- Datennutzungsklauseln ---"
        while read -r f; do
            local basename
            basename=$(basename "$f" .md)
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"datennutzungsklauseln\",\"doc_type\":\"data_clause\",\"license\":\"cc_by_4\",\"attribution\":\"OpenCode.de, lizenziert unter CC BY 4.0\",\"source\":\"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster\",\"filename\":\"$basename\"}" \
                "Datennutzungsklausel: $basename"
        done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    local after
    after=$(collection_count "$col")
    log "Collection $col: $before → $after chunks"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz
|
||||
# =============================================================================
|
||||
# Phase E: ingest EDPB/EDPS data-protection guidance PDFs downloaded in
# phase A into bp_compliance_datenschutz.
phase_datenschutz() {
    log "=========================================="
    log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
    log "=========================================="

    local collection="bp_compliance_datenschutz"
    local chunks_before
    chunks_before=$(collection_count "$collection")
    log "Collection $collection: $chunks_before chunks (before)"

    # EDPB Guidelines: display name = file stem minus "edpb_" prefix,
    # underscores rendered as spaces.
    local doc stem
    for doc in "$WORK_DIR"/pdfs/edpb_*.pdf; do
        [[ -f "$doc" ]] || continue   # glob may not match anything
        stem=$(basename "$doc" .pdf)
        local guideline_name="${stem#edpb_}"
        guideline_name="${guideline_name//_/ }"
        upload_file "$doc" "$collection" "compliance_datenschutz" "guidance" "2024" \
            "{\"source_id\":\"edpb\",\"doc_type\":\"guidance\",\"guideline_name\":\"$guideline_name\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Board (EDPB)\",\"source\":\"edpb.europa.eu\"}" \
            "EDPB: $guideline_name"
    done

    # EDPS Guidance: same naming scheme for the "edps_" prefix.
    for doc in "$WORK_DIR"/pdfs/edps_*.pdf; do
        [[ -f "$doc" ]] || continue
        stem=$(basename "$doc" .pdf)
        local guidance_name="${stem#edps_}"
        guidance_name="${guidance_name//_/ }"
        upload_file "$doc" "$collection" "compliance_datenschutz" "guidance" "2024" \
            "{\"source_id\":\"edps\",\"doc_type\":\"guidance\",\"guidance_name\":\"$guidance_name\",\"license\":\"reuse_notice\",\"attribution\":\"European Data Protection Supervisor (EDPS)\",\"source\":\"edps.europa.eu\"}" \
            "EDPS: $guidance_name"
    done

    local chunks_after
    chunks_after=$(collection_count "$collection")
    log "Collection $collection: $chunks_before → $chunks_after chunks"
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE F: Verifizierung
|
||||
# =============================================================================
|
||||
# Phase F: sanity-check the ingestion — print per-collection chunk counts and
# run three smoke-test searches against the RAG search endpoint.
#
# FIX: the curl + embedded-python result parser was copy-pasted three times
# (one drifted typo away from three different behaviors); it is now a single
# data-driven loop producing identical output.
phase_verify() {
    log "=========================================="
    log "PHASE F: Verifizierung"
    log "=========================================="

    echo ""
    echo "=== Collection Stats ==="
    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        local count
        count=$(collection_count "$col")
        printf " %-30s %s chunks\n" "$col" "$count"
    done

    echo ""
    echo "=== Test-Suchen ==="

    # One entry per smoke test: "log label|search query|collection (display only)".
    # Note the second entry's label and query intentionally differ.
    local -a smoke_searches=(
        "Impressumspflicht digitale Dienste|Impressumspflicht digitale Dienste|bp_compliance_gesetze"
        "Cookie Einwilligung|Cookie Einwilligung ePrivacy|bp_compliance_ce"
        "Privacy Policy Template GDPR|Privacy Policy Template GDPR|bp_legal_templates"
    )
    local entry search_label search_query search_col
    for entry in "${smoke_searches[@]}"; do
        IFS='|' read -r search_label search_query search_col <<< "$entry"
        log "Suche: '$search_label' in $search_col"
        # POST the query and pretty-print the top hits; any curl/parse failure
        # degrades to a one-line notice instead of aborting the phase.
        curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
            -H 'Content-Type: application/json' \
            -d "{\"query\":\"$search_query\",\"regulation_codes\":null,\"limit\":3,\"min_score\":0.5}" 2>/dev/null \
            | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f' [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except: print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
    done

    echo ""
}
|
||||
|
||||
# =============================================================================
|
||||
# PHASE G: Corpus Version Registration
|
||||
# =============================================================================
|
||||
# Phase G: record one version row per non-empty collection in
# compliance_corpus_versions so later assessments can be pinned to the exact
# corpus state they were evaluated against. Versions are "YYYY-MM-DD.N".
# DB failures are non-fatal: each INSERT degrades to a warning.
phase_register_version() {
    log "=========================================="
    log "PHASE G: Corpus Version Registration"
    log "=========================================="

    local today
    today=$(date '+%Y-%m-%d')

    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        local count
        count=$(collection_count "$col")

        # Skip empty collections; "?" is presumably collection_count's
        # error/unknown marker — TODO confirm against its definition.
        if [[ "$count" == "?" || "$count" == "0" ]]; then
            warn "Skipping version for $col (count=$count)"
            continue
        fi

        # Determine next version number for today: N = 1 + versions already
        # registered today for this collection. Falls back to 0 (-> ".1")
        # when the DB is unreachable.
        local existing_count
        existing_count=$(psql "$DB_URL" -tAc \
            "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
            2>/dev/null || echo "0")
        local seq=$((existing_count + 1))
        local version="${today}.${seq}"

        # Get regulations list based on collection (Postgres array literal,
        # inserted verbatim into the row below).
        local regs=""
        case "$col" in
            bp_compliance_ce)
                regs='{eu_2022_2065,eu_2002_58,eu_2021_914}'
                ;;
            bp_compliance_gesetze)
                regs='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
                ;;
            bp_legal_templates)
                regs='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
                ;;
            bp_compliance_datenschutz)
                regs='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
                ;;
        esac

        # Compute digest from Qdrant collection info: first 32 hex chars of
        # SHA-256 over the sorted-keys JSON of the collection's "result"
        # object; empty string if Qdrant or python3 fails.
        local digest
        digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
            | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
            2>/dev/null || echo "")

        log "Registering version $version for $col ($count chunks)"

        # NOTE(review): documents_count is written from the global UPLOADED
        # counter (the script-wide upload total so far), not a per-collection
        # figure — confirm this is intended.
        psql "$DB_URL" -c "
            INSERT INTO compliance_corpus_versions
            (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
            VALUES
            ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-legal-corpus.sh', 'system')
        " 2>/dev/null && ok "Version $version registered for $col" || warn "Version registration failed for $col (DB not available?)"
    done
}
|
||||
|
||||
# =============================================================================
|
||||
# MAIN
|
||||
# =============================================================================
|
||||
# Orchestrates the full ingestion run: preflight connectivity checks, then
# either one selected phase (--only) or the whole pipeline, followed by the
# upload summary. Exits non-zero on unreachable services or failed uploads.
main() {
    log "=========================================="
    log "BreakPilot Legal Corpus Ingestion"
    log "=========================================="
    log "Work dir: $WORK_DIR"
    log "RAG API: $RAG_URL"
    log "Qdrant: $QDRANT_URL"
    echo ""

    # Preflight: an empty POST to the RAG upload endpoint must answer with a
    # validation message mentioning the missing "file" field.
    if ! curl $CURL_OPTS "$RAG_URL" -X POST 2>/dev/null | grep -q "file"; then
        fail "RAG API not reachable at $RAG_URL"
        exit 1
    fi
    ok "RAG API reachable"

    # Preflight: Qdrant must answer the collections listing.
    if ! curl -s "$QDRANT_URL/collections" >/dev/null 2>&1; then
        fail "Qdrant not reachable at $QDRANT_URL"
        exit 1
    fi
    ok "Qdrant reachable"
    echo ""

    if [[ -n "$ONLY_PHASE" ]]; then
        # Single-phase mode: map the CLI phase name onto its function.
        case "$ONLY_PHASE" in
            download)    phase_download ;;
            gesetze)     phase_gesetze ;;
            eu)          phase_eu ;;
            templates)   phase_templates ;;
            datenschutz) phase_datenschutz ;;
            verify)      phase_verify ;;
            version)     phase_register_version ;;
            *) fail "Unknown phase: $ONLY_PHASE"; exit 1 ;;
        esac
    else
        if [[ "$SKIP_DOWNLOAD" == "true" ]]; then
            log "Skipping download phase (--skip-download)"
        else
            phase_download
        fi
        # Remaining phases in dependency order, each preceded by a blank line.
        local phase_fn
        for phase_fn in phase_gesetze phase_eu phase_templates phase_datenschutz phase_verify phase_register_version; do
            echo ""
            "$phase_fn"
        done
    fi

    # Summary
    echo ""
    log "=========================================="
    log "ERGEBNIS"
    log "=========================================="
    log "Uploaded: $UPLOADED"
    log "Failed: $FAILED"
    log "Skipped: $SKIPPED"
    log "=========================================="

    if (( FAILED > 0 )); then
        warn "$FAILED uploads fehlgeschlagen!"
        exit 1
    fi

    ok "Ingestion abgeschlossen!"
}
|
||||
|
||||
# Script entry point: forward all CLI arguments to main.
main "$@"
|
||||
Reference in New Issue
Block a user