feat: add RAG corpus versioning and source policy backend
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 34s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
All checks were successful
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-ai-compliance (push) Successful in 34s
CI / test-python-backend-compliance (push) Successful in 32s
CI / test-python-document-crawler (push) Successful in 23s
CI / test-python-dsms-gateway (push) Successful in 18s
Part 1 — RAG Corpus Versioning: - New DB table compliance_corpus_versions (migration 017) - Go CorpusVersionStore with CRUD operations - Assessment struct extended with corpus_version_id - API endpoints: GET /rag/corpus-status, /rag/corpus-versions/:collection - RAG routes (search, regulations) now registered in main.go - Ingestion script registers corpus versions after each run - Frontend staleness badge in SDK sidebar Part 3 — Source Policy Backend: - New FastAPI router with CRUD for allowed sources, PII rules, operations matrix, audit trail, stats, and compliance report - SQLAlchemy models for all source policy tables (migration 001) - Frontend API base corrected from edu-search:8088/8089 to backend-compliance:8002/api Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,8 +3,8 @@
|
|||||||
/**
|
/**
|
||||||
* Source Policy Management Page (SDK Version)
|
* Source Policy Management Page (SDK Version)
|
||||||
*
|
*
|
||||||
* Whitelist-based data source management for edu-search-service.
|
* Whitelist-based data source management for compliance RAG corpus.
|
||||||
* For auditors: Full audit trail for all changes.
|
* Controls which legal sources may be used, PII rules, and audit trail.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { useState, useEffect } from 'react'
|
import { useState, useEffect } from 'react'
|
||||||
@@ -15,14 +15,14 @@ import { OperationsMatrixTab } from '@/components/sdk/source-policy/OperationsMa
|
|||||||
import { PIIRulesTab } from '@/components/sdk/source-policy/PIIRulesTab'
|
import { PIIRulesTab } from '@/components/sdk/source-policy/PIIRulesTab'
|
||||||
import { AuditTab } from '@/components/sdk/source-policy/AuditTab'
|
import { AuditTab } from '@/components/sdk/source-policy/AuditTab'
|
||||||
|
|
||||||
// API base URL for edu-search-service
|
// API base URL for backend-compliance
|
||||||
const getApiBase = () => {
|
const getApiBase = () => {
|
||||||
if (typeof window === 'undefined') return 'http://localhost:8088'
|
if (typeof window === 'undefined') return 'http://localhost:8002/api'
|
||||||
const hostname = window.location.hostname
|
const hostname = window.location.hostname
|
||||||
if (hostname === 'localhost' || hostname === '127.0.0.1') {
|
if (hostname === 'localhost' || hostname === '127.0.0.1') {
|
||||||
return 'http://localhost:8088'
|
return 'http://localhost:8002/api'
|
||||||
}
|
}
|
||||||
return `https://${hostname}:8089`
|
return `https://${hostname}:8002/api`
|
||||||
}
|
}
|
||||||
|
|
||||||
interface PolicyStats {
|
interface PolicyStats {
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import {
|
|||||||
getStepsForPackage,
|
getStepsForPackage,
|
||||||
type SDKPackageId,
|
type SDKPackageId,
|
||||||
type SDKStep,
|
type SDKStep,
|
||||||
|
type RAGCorpusStatus,
|
||||||
} from '@/lib/sdk'
|
} from '@/lib/sdk'
|
||||||
|
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
@@ -288,6 +289,41 @@ interface SDKSidebarProps {
|
|||||||
onCollapsedChange?: (collapsed: boolean) => void
|
onCollapsedChange?: (collapsed: boolean) => void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// CORPUS STALENESS INFO
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
function CorpusStalenessInfo({ ragCorpusStatus }: { ragCorpusStatus: RAGCorpusStatus }) {
|
||||||
|
const collections = ragCorpusStatus.collections
|
||||||
|
const collectionNames = Object.keys(collections)
|
||||||
|
if (collectionNames.length === 0) return null
|
||||||
|
|
||||||
|
// Check if corpus was updated after the last fetch (simplified: show last update time)
|
||||||
|
const lastUpdated = collectionNames.reduce((latest, name) => {
|
||||||
|
const updated = new Date(collections[name].last_updated)
|
||||||
|
return updated > latest ? updated : latest
|
||||||
|
}, new Date(0))
|
||||||
|
|
||||||
|
const daysSinceUpdate = Math.floor((Date.now() - lastUpdated.getTime()) / (1000 * 60 * 60 * 24))
|
||||||
|
const totalChunks = collectionNames.reduce((sum, name) => sum + collections[name].chunks_count, 0)
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="px-4 py-2 border-b border-gray-100">
|
||||||
|
<div className="flex items-center gap-2 text-xs">
|
||||||
|
<div className={`w-2 h-2 rounded-full flex-shrink-0 ${daysSinceUpdate > 30 ? 'bg-amber-400' : 'bg-green-400'}`} />
|
||||||
|
<span className="text-gray-500 truncate">
|
||||||
|
RAG Corpus: {totalChunks} Chunks
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
{daysSinceUpdate > 30 && (
|
||||||
|
<div className="mt-1 text-xs text-amber-600 bg-amber-50 rounded px-2 py-1">
|
||||||
|
Corpus {daysSinceUpdate}d alt — Re-Evaluation empfohlen
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarProps) {
|
export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarProps) {
|
||||||
const pathname = usePathname()
|
const pathname = usePathname()
|
||||||
const { state, packageCompletion, completionPercentage, getCheckpointStatus } = useSDK()
|
const { state, packageCompletion, completionPercentage, getCheckpointStatus } = useSDK()
|
||||||
@@ -391,6 +427,11 @@ export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarP
|
|||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* RAG Corpus Staleness Badge */}
|
||||||
|
{!collapsed && state.ragCorpusStatus && (
|
||||||
|
<CorpusStalenessInfo ragCorpusStatus={state.ragCorpusStatus} />
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Navigation - 5 Packages */}
|
{/* Navigation - 5 Packages */}
|
||||||
<nav className="flex-1 overflow-y-auto">
|
<nav className="flex-1 overflow-y-auto">
|
||||||
{SDK_PACKAGES.map(pkg => {
|
{SDK_PACKAGES.map(pkg => {
|
||||||
@@ -510,6 +551,18 @@ export function SDKSidebar({ collapsed = false, onCollapsedChange }: SDKSidebarP
|
|||||||
isActive={pathname === '/sdk/dsms'}
|
isActive={pathname === '/sdk/dsms'}
|
||||||
collapsed={collapsed}
|
collapsed={collapsed}
|
||||||
/>
|
/>
|
||||||
|
<AdditionalModuleItem
|
||||||
|
href="/development/sdk-flow"
|
||||||
|
icon={
|
||||||
|
<svg className="w-5 h-5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2}
|
||||||
|
d="M13 10V3L4 14h7v7l9-11h-7z" />
|
||||||
|
</svg>
|
||||||
|
}
|
||||||
|
label="SDK Flow"
|
||||||
|
isActive={pathname === '/development/sdk-flow'}
|
||||||
|
collapsed={collapsed}
|
||||||
|
/>
|
||||||
{state.companyProfile?.machineBuilder?.ceMarkingRequired && (
|
{state.companyProfile?.machineBuilder?.ceMarkingRequired && (
|
||||||
<AdditionalModuleItem
|
<AdditionalModuleItem
|
||||||
href="/sdk/iace"
|
href="/sdk/iace"
|
||||||
|
|||||||
@@ -102,6 +102,9 @@ const initialState: SDKState = {
|
|||||||
// IACE (Industrial AI Compliance Engine)
|
// IACE (Industrial AI Compliance Engine)
|
||||||
iaceProjects: [],
|
iaceProjects: [],
|
||||||
|
|
||||||
|
// RAG Corpus Versioning
|
||||||
|
ragCorpusStatus: null,
|
||||||
|
|
||||||
// Security
|
// Security
|
||||||
sbom: null,
|
sbom: null,
|
||||||
securityIssues: [],
|
securityIssues: [],
|
||||||
|
|||||||
@@ -975,6 +975,22 @@ export interface SBOMDependency {
|
|||||||
to: string
|
to: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RAG Corpus Versioning
|
||||||
|
export interface RAGCorpusCollectionStatus {
|
||||||
|
id: string
|
||||||
|
current_version: string
|
||||||
|
documents_count: number
|
||||||
|
chunks_count: number
|
||||||
|
regulations: string[]
|
||||||
|
last_updated: string
|
||||||
|
digest: string
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface RAGCorpusStatus {
|
||||||
|
collections: Record<string, RAGCorpusCollectionStatus>
|
||||||
|
fetchedAt: string
|
||||||
|
}
|
||||||
|
|
||||||
export interface SBOM {
|
export interface SBOM {
|
||||||
format: 'CycloneDX' | 'SPDX'
|
format: 'CycloneDX' | 'SPDX'
|
||||||
version: string
|
version: string
|
||||||
@@ -1504,6 +1520,9 @@ export interface SDKState {
|
|||||||
// IACE (Industrial AI Compliance Engine)
|
// IACE (Industrial AI Compliance Engine)
|
||||||
iaceProjects: IACEProjectSummary[]
|
iaceProjects: IACEProjectSummary[]
|
||||||
|
|
||||||
|
// RAG Corpus Versioning
|
||||||
|
ragCorpusStatus: RAGCorpusStatus | null
|
||||||
|
|
||||||
// Security
|
// Security
|
||||||
sbom: SBOM | null
|
sbom: SBOM | null
|
||||||
securityIssues: SecurityIssue[]
|
securityIssues: SecurityIssue[]
|
||||||
|
|||||||
@@ -62,6 +62,7 @@ func main() {
|
|||||||
dsgvoStore := dsgvo.NewStore(pool)
|
dsgvoStore := dsgvo.NewStore(pool)
|
||||||
uccaStore := ucca.NewStore(pool)
|
uccaStore := ucca.NewStore(pool)
|
||||||
escalationStore := ucca.NewEscalationStore(pool)
|
escalationStore := ucca.NewEscalationStore(pool)
|
||||||
|
corpusVersionStore := ucca.NewCorpusVersionStore(pool)
|
||||||
roadmapStore := roadmap.NewStore(pool)
|
roadmapStore := roadmap.NewStore(pool)
|
||||||
workshopStore := workshop.NewStore(pool)
|
workshopStore := workshop.NewStore(pool)
|
||||||
portfolioStore := portfolio.NewStore(pool)
|
portfolioStore := portfolio.NewStore(pool)
|
||||||
@@ -120,6 +121,7 @@ func main() {
|
|||||||
vendorHandlers := handlers.NewVendorHandlers(vendorStore)
|
vendorHandlers := handlers.NewVendorHandlers(vendorStore)
|
||||||
iaceHandler := handlers.NewIACEHandler(iaceStore)
|
iaceHandler := handlers.NewIACEHandler(iaceStore)
|
||||||
trainingHandlers := handlers.NewTrainingHandlers(trainingStore, contentGenerator)
|
trainingHandlers := handlers.NewTrainingHandlers(trainingStore, contentGenerator)
|
||||||
|
ragHandlers := handlers.NewRAGHandlers(corpusVersionStore)
|
||||||
|
|
||||||
// Initialize middleware
|
// Initialize middleware
|
||||||
rbacMiddleware := rbac.NewMiddleware(rbacService, policyEngine)
|
rbacMiddleware := rbac.NewMiddleware(rbacService, policyEngine)
|
||||||
@@ -345,6 +347,15 @@ func main() {
|
|||||||
uccaRoutes.POST("/dsb-pool", escalationHandlers.AddDSBPoolMember)
|
uccaRoutes.POST("/dsb-pool", escalationHandlers.AddDSBPoolMember)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RAG routes - Legal Corpus Search & Versioning
|
||||||
|
ragRoutes := v1.Group("/rag")
|
||||||
|
{
|
||||||
|
ragRoutes.POST("/search", ragHandlers.Search)
|
||||||
|
ragRoutes.GET("/regulations", ragHandlers.ListRegulations)
|
||||||
|
ragRoutes.GET("/corpus-status", ragHandlers.CorpusStatus)
|
||||||
|
ragRoutes.GET("/corpus-versions/:collection", ragHandlers.CorpusVersionHistory)
|
||||||
|
}
|
||||||
|
|
||||||
// Roadmap routes - Compliance Implementation Roadmaps
|
// Roadmap routes - Compliance Implementation Roadmaps
|
||||||
roadmapRoutes := v1.Group("/roadmaps")
|
roadmapRoutes := v1.Group("/roadmaps")
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -9,13 +9,15 @@ import (
|
|||||||
|
|
||||||
// RAGHandlers handles RAG search API endpoints.
type RAGHandlers struct {
	ragClient *ucca.LegalRAGClient // client for legal-corpus search/regulations
	// Optional; when nil the corpus-version endpoints respond 503.
	corpusVersionStore *ucca.CorpusVersionStore
}

// NewRAGHandlers creates new RAG handlers.
// corpusVersionStore may be nil if corpus versioning is not wired up.
func NewRAGHandlers(corpusVersionStore *ucca.CorpusVersionStore) *RAGHandlers {
	return &RAGHandlers{
		ragClient:          ucca.NewLegalRAGClient(),
		corpusVersionStore: corpusVersionStore,
	}
}
|
||||||
|
|
||||||
@@ -74,3 +76,62 @@ func (h *RAGHandlers) ListRegulations(c *gin.Context) {
|
|||||||
"count": len(regs),
|
"count": len(regs),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CorpusStatus returns the current version status of all RAG collections.
|
||||||
|
// GET /sdk/v1/rag/corpus-status
|
||||||
|
func (h *RAGHandlers) CorpusStatus(c *gin.Context) {
|
||||||
|
if h.corpusVersionStore == nil {
|
||||||
|
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "corpus version store not configured"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
versions, err := h.corpusVersionStore.GetAllLatestVersions(c.Request.Context())
|
||||||
|
if err != nil {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch corpus versions: " + err.Error()})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
collections := make(map[string]gin.H)
|
||||||
|
for _, v := range versions {
|
||||||
|
collections[v.CollectionName] = gin.H{
|
||||||
|
"id": v.ID,
|
||||||
|
"current_version": v.Version,
|
||||||
|
"documents_count": v.DocumentsCount,
|
||||||
|
"chunks_count": v.ChunksCount,
|
||||||
|
"regulations": v.Regulations,
|
||||||
|
"last_updated": v.CreatedAt,
|
||||||
|
"digest": v.Digest,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, gin.H{
|
||||||
|
"collections": collections,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// CorpusVersionHistory returns the version history for a specific collection.
|
||||||
|
// GET /sdk/v1/rag/corpus-versions/:collection
|
||||||
|
func (h *RAGHandlers) CorpusVersionHistory(c *gin.Context) {
|
||||||
|
if h.corpusVersionStore == nil {
|
||||||
|
c.JSON(http.StatusServiceUnavailable, gin.H{"error": "corpus version store not configured"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
collection := c.Param("collection")
|
||||||
|
if collection == "" {
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": "collection name required"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
versions, err := h.corpusVersionStore.ListCorpusVersions(c.Request.Context(), collection)
|
||||||
|
if err != nil {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to fetch corpus versions: " + err.Error()})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, gin.H{
|
||||||
|
"collection": collection,
|
||||||
|
"versions": versions,
|
||||||
|
"count": len(versions),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
158
ai-compliance-sdk/internal/ucca/corpus_version.go
Normal file
158
ai-compliance-sdk/internal/ucca/corpus_version.go
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
package ucca
|
||||||
|
|
||||||
|
import (
	"context"
	"errors"
	"time"

	"github.com/google/uuid"
	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"
)
|
||||||
|
|
||||||
|
// CorpusVersion tracks a specific version of the RAG compliance corpus.
// One record is written per ingestion run and collection; assessments can
// reference a record (corpus_version_id) so stale results are detectable.
type CorpusVersion struct {
	ID              uuid.UUID `json:"id"`
	Version         string    `json:"version"`         // "2026-03-02.1"
	CollectionName  string    `json:"collection_name"` // "bp_compliance_ce"
	DocumentsCount  int       `json:"documents_count"`
	ChunksCount     int       `json:"chunks_count"`
	Regulations     []string  `json:"regulations"`      // ["eu_2016_679", ...]
	Digest          string    `json:"digest,omitempty"` // SHA256 over chunks
	IngestionSource string    `json:"ingestion_source,omitempty"`
	Notes           string    `json:"notes,omitempty"`
	CreatedAt       time.Time `json:"created_at"`
	CreatedBy       string    `json:"created_by,omitempty"`
}
|
||||||
|
|
||||||
|
// CorpusVersionStore handles corpus version persistence.
type CorpusVersionStore struct {
	pool *pgxpool.Pool // shared connection pool; not owned/closed by the store
}

// NewCorpusVersionStore creates a new corpus version store.
func NewCorpusVersionStore(pool *pgxpool.Pool) *CorpusVersionStore {
	return &CorpusVersionStore{pool: pool}
}
|
||||||
|
|
||||||
|
// CreateCorpusVersion inserts a new corpus version record.
// A zero ID is replaced by a fresh UUID and a zero CreatedAt by the current
// UTC time, so callers may leave both unset; v is mutated accordingly.
func (s *CorpusVersionStore) CreateCorpusVersion(ctx context.Context, v *CorpusVersion) error {
	if v.ID == uuid.Nil {
		v.ID = uuid.New()
	}
	if v.CreatedAt.IsZero() {
		v.CreatedAt = time.Now().UTC()
	}

	_, err := s.pool.Exec(ctx, `
		INSERT INTO compliance_corpus_versions (
			id, version, collection_name, documents_count, chunks_count,
			regulations, digest, ingestion_source, notes, created_at, created_by
		) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
	`,
		v.ID, v.Version, v.CollectionName, v.DocumentsCount, v.ChunksCount,
		v.Regulations, v.Digest, v.IngestionSource, v.Notes, v.CreatedAt, v.CreatedBy,
	)
	return err
}
|
||||||
|
|
||||||
|
// GetLatestCorpusVersion returns the most recent version for a collection.
|
||||||
|
func (s *CorpusVersionStore) GetLatestCorpusVersion(ctx context.Context, collection string) (*CorpusVersion, error) {
|
||||||
|
var v CorpusVersion
|
||||||
|
err := s.pool.QueryRow(ctx, `
|
||||||
|
SELECT id, version, collection_name, documents_count, chunks_count,
|
||||||
|
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||||
|
FROM compliance_corpus_versions
|
||||||
|
WHERE collection_name = $1
|
||||||
|
ORDER BY created_at DESC
|
||||||
|
LIMIT 1
|
||||||
|
`, collection).Scan(
|
||||||
|
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||||
|
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||||
|
)
|
||||||
|
if err == pgx.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &v, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetCorpusVersionByID retrieves a specific corpus version by ID.
|
||||||
|
func (s *CorpusVersionStore) GetCorpusVersionByID(ctx context.Context, id uuid.UUID) (*CorpusVersion, error) {
|
||||||
|
var v CorpusVersion
|
||||||
|
err := s.pool.QueryRow(ctx, `
|
||||||
|
SELECT id, version, collection_name, documents_count, chunks_count,
|
||||||
|
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||||
|
FROM compliance_corpus_versions
|
||||||
|
WHERE id = $1
|
||||||
|
`, id).Scan(
|
||||||
|
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||||
|
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||||
|
)
|
||||||
|
if err == pgx.ErrNoRows {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return &v, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListCorpusVersions returns all versions for a collection, newest first.
|
||||||
|
func (s *CorpusVersionStore) ListCorpusVersions(ctx context.Context, collection string) ([]CorpusVersion, error) {
|
||||||
|
rows, err := s.pool.Query(ctx, `
|
||||||
|
SELECT id, version, collection_name, documents_count, chunks_count,
|
||||||
|
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||||
|
FROM compliance_corpus_versions
|
||||||
|
WHERE collection_name = $1
|
||||||
|
ORDER BY created_at DESC
|
||||||
|
`, collection)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var versions []CorpusVersion
|
||||||
|
for rows.Next() {
|
||||||
|
var v CorpusVersion
|
||||||
|
err := rows.Scan(
|
||||||
|
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||||
|
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
versions = append(versions, v)
|
||||||
|
}
|
||||||
|
return versions, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAllLatestVersions returns the latest version for every collection.
|
||||||
|
func (s *CorpusVersionStore) GetAllLatestVersions(ctx context.Context) ([]CorpusVersion, error) {
|
||||||
|
rows, err := s.pool.Query(ctx, `
|
||||||
|
SELECT DISTINCT ON (collection_name)
|
||||||
|
id, version, collection_name, documents_count, chunks_count,
|
||||||
|
regulations, digest, ingestion_source, notes, created_at, created_by
|
||||||
|
FROM compliance_corpus_versions
|
||||||
|
ORDER BY collection_name, created_at DESC
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var versions []CorpusVersion
|
||||||
|
for rows.Next() {
|
||||||
|
var v CorpusVersion
|
||||||
|
err := rows.Scan(
|
||||||
|
&v.ID, &v.Version, &v.CollectionName, &v.DocumentsCount, &v.ChunksCount,
|
||||||
|
&v.Regulations, &v.Digest, &v.IngestionSource, &v.Notes, &v.CreatedAt, &v.CreatedBy,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
versions = append(versions, v)
|
||||||
|
}
|
||||||
|
return versions, nil
|
||||||
|
}
|
||||||
@@ -471,6 +471,10 @@ type Assessment struct {
|
|||||||
Art22Risk bool `json:"art22_risk"`
|
Art22Risk bool `json:"art22_risk"`
|
||||||
TrainingAllowed TrainingAllowed `json:"training_allowed"`
|
TrainingAllowed TrainingAllowed `json:"training_allowed"`
|
||||||
|
|
||||||
|
// Corpus Versioning (RAG)
|
||||||
|
CorpusVersionID *uuid.UUID `json:"corpus_version_id,omitempty"`
|
||||||
|
CorpusVersion string `json:"corpus_version,omitempty"`
|
||||||
|
|
||||||
// LLM Explanation (optional)
|
// LLM Explanation (optional)
|
||||||
ExplanationText *string `json:"explanation_text,omitempty"`
|
ExplanationText *string `json:"explanation_text,omitempty"`
|
||||||
ExplanationGeneratedAt *time.Time `json:"explanation_generated_at,omitempty"`
|
ExplanationGeneratedAt *time.Time `json:"explanation_generated_at,omitempty"`
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
|
|||||||
triggered_rules, required_controls, recommended_architecture,
|
triggered_rules, required_controls, recommended_architecture,
|
||||||
forbidden_patterns, example_matches,
|
forbidden_patterns, example_matches,
|
||||||
dsfa_recommended, art22_risk, training_allowed,
|
dsfa_recommended, art22_risk, training_allowed,
|
||||||
|
corpus_version_id, corpus_version,
|
||||||
explanation_text, explanation_generated_at, explanation_model,
|
explanation_text, explanation_generated_at, explanation_model,
|
||||||
domain, created_at, updated_at, created_by
|
domain, created_at, updated_at, created_by
|
||||||
) VALUES (
|
) VALUES (
|
||||||
@@ -61,8 +62,9 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
|
|||||||
$14, $15, $16,
|
$14, $15, $16,
|
||||||
$17, $18,
|
$17, $18,
|
||||||
$19, $20, $21,
|
$19, $20, $21,
|
||||||
$22, $23, $24,
|
$22, $23,
|
||||||
$25, $26, $27, $28
|
$24, $25, $26,
|
||||||
|
$27, $28, $29, $30
|
||||||
)
|
)
|
||||||
`,
|
`,
|
||||||
a.ID, a.TenantID, a.NamespaceID, a.Title, a.PolicyVersion, a.Status,
|
a.ID, a.TenantID, a.NamespaceID, a.Title, a.PolicyVersion, a.Status,
|
||||||
@@ -71,6 +73,7 @@ func (s *Store) CreateAssessment(ctx context.Context, a *Assessment) error {
|
|||||||
triggeredRules, requiredControls, recommendedArchitecture,
|
triggeredRules, requiredControls, recommendedArchitecture,
|
||||||
forbiddenPatterns, exampleMatches,
|
forbiddenPatterns, exampleMatches,
|
||||||
a.DSFARecommended, a.Art22Risk, string(a.TrainingAllowed),
|
a.DSFARecommended, a.Art22Risk, string(a.TrainingAllowed),
|
||||||
|
a.CorpusVersionID, a.CorpusVersion,
|
||||||
a.ExplanationText, a.ExplanationGeneratedAt, a.ExplanationModel,
|
a.ExplanationText, a.ExplanationGeneratedAt, a.ExplanationModel,
|
||||||
string(a.Domain), a.CreatedAt, a.UpdatedAt, a.CreatedBy,
|
string(a.Domain), a.CreatedAt, a.UpdatedAt, a.CreatedBy,
|
||||||
)
|
)
|
||||||
@@ -92,6 +95,7 @@ func (s *Store) GetAssessment(ctx context.Context, id uuid.UUID) (*Assessment, e
|
|||||||
triggered_rules, required_controls, recommended_architecture,
|
triggered_rules, required_controls, recommended_architecture,
|
||||||
forbidden_patterns, example_matches,
|
forbidden_patterns, example_matches,
|
||||||
dsfa_recommended, art22_risk, training_allowed,
|
dsfa_recommended, art22_risk, training_allowed,
|
||||||
|
corpus_version_id, corpus_version,
|
||||||
explanation_text, explanation_generated_at, explanation_model,
|
explanation_text, explanation_generated_at, explanation_model,
|
||||||
domain, created_at, updated_at, created_by
|
domain, created_at, updated_at, created_by
|
||||||
FROM ucca_assessments WHERE id = $1
|
FROM ucca_assessments WHERE id = $1
|
||||||
@@ -102,6 +106,7 @@ func (s *Store) GetAssessment(ctx context.Context, id uuid.UUID) (*Assessment, e
|
|||||||
&triggeredRules, &requiredControls, &recommendedArchitecture,
|
&triggeredRules, &requiredControls, &recommendedArchitecture,
|
||||||
&forbiddenPatterns, &exampleMatches,
|
&forbiddenPatterns, &exampleMatches,
|
||||||
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
|
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
|
||||||
|
&a.CorpusVersionID, &a.CorpusVersion,
|
||||||
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
|
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
|
||||||
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
|
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
|
||||||
)
|
)
|
||||||
@@ -141,6 +146,7 @@ func (s *Store) ListAssessments(ctx context.Context, tenantID uuid.UUID, filters
|
|||||||
triggered_rules, required_controls, recommended_architecture,
|
triggered_rules, required_controls, recommended_architecture,
|
||||||
forbidden_patterns, example_matches,
|
forbidden_patterns, example_matches,
|
||||||
dsfa_recommended, art22_risk, training_allowed,
|
dsfa_recommended, art22_risk, training_allowed,
|
||||||
|
corpus_version_id, corpus_version,
|
||||||
explanation_text, explanation_generated_at, explanation_model,
|
explanation_text, explanation_generated_at, explanation_model,
|
||||||
domain, created_at, updated_at, created_by
|
domain, created_at, updated_at, created_by
|
||||||
FROM ucca_assessments WHERE tenant_id = $1`
|
FROM ucca_assessments WHERE tenant_id = $1`
|
||||||
@@ -194,6 +200,7 @@ func (s *Store) ListAssessments(ctx context.Context, tenantID uuid.UUID, filters
|
|||||||
&triggeredRules, &requiredControls, &recommendedArchitecture,
|
&triggeredRules, &requiredControls, &recommendedArchitecture,
|
||||||
&forbiddenPatterns, &exampleMatches,
|
&forbiddenPatterns, &exampleMatches,
|
||||||
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
|
&a.DSFARecommended, &a.Art22Risk, &trainingAllowed,
|
||||||
|
&a.CorpusVersionID, &a.CorpusVersion,
|
||||||
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
|
&a.ExplanationText, &a.ExplanationGeneratedAt, &a.ExplanationModel,
|
||||||
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
|
&domain, &a.CreatedAt, &a.UpdatedAt, &a.CreatedBy,
|
||||||
)
|
)
|
||||||
|
|||||||
35
ai-compliance-sdk/migrations/017_corpus_versioning.sql
Normal file
35
ai-compliance-sdk/migrations/017_corpus_versioning.sql
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
-- =============================================================================
-- Migration 017: RAG Corpus Versioning
--
-- Tracks versions of the RAG corpus so assessments can record which
-- corpus version they were evaluated against. Enables staleness detection
-- and re-evaluation recommendations.
-- =============================================================================

CREATE TABLE IF NOT EXISTS compliance_corpus_versions (
    id               UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    version          VARCHAR(50) NOT NULL,   -- "2026-03-02.1"
    collection_name  VARCHAR(100) NOT NULL,  -- "bp_compliance_ce"
    documents_count  INT NOT NULL DEFAULT 0,
    chunks_count     INT NOT NULL DEFAULT 0,
    regulations      TEXT[],                 -- {"eu_2016_679", "eu_2024_1689"}
    digest           VARCHAR(128),           -- SHA256 over all chunks
    ingestion_source VARCHAR(200),           -- "ingest-legal-corpus.sh"
    notes            TEXT,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by       VARCHAR(100)
);

-- Single composite index: its leading column (collection_name) also serves
-- plain equality lookups, so a separate index on collection_name alone would
-- be redundant and only add write overhead.
CREATE INDEX IF NOT EXISTS idx_corpus_versions_latest
    ON compliance_corpus_versions(collection_name, created_at DESC);

-- Add corpus_version_id to ucca_assessments
ALTER TABLE ucca_assessments
    ADD COLUMN IF NOT EXISTS corpus_version_id UUID REFERENCES compliance_corpus_versions(id),
    ADD COLUMN IF NOT EXISTS corpus_version VARCHAR(50);

CREATE INDEX IF NOT EXISTS idx_ucca_assessments_corpus_version
    ON ucca_assessments(corpus_version_id);
|
||||||
503
backend-compliance/compliance/api/source_policy_router.py
Normal file
503
backend-compliance/compliance/api/source_policy_router.py
Normal file
@@ -0,0 +1,503 @@
|
|||||||
|
"""
|
||||||
|
Source Policy Router — Manages allowed compliance data sources.
|
||||||
|
|
||||||
|
Controls which legal sources the RAG corpus may use,
|
||||||
|
operations matrix, PII rules, and provides audit trail.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /api/v1/admin/sources — List all sources
|
||||||
|
POST /api/v1/admin/sources — Add new source
|
||||||
|
GET /api/v1/admin/sources/{id} — Get source by ID
|
||||||
|
PUT /api/v1/admin/sources/{id} — Update source
|
||||||
|
DELETE /api/v1/admin/sources/{id} — Remove source
|
||||||
|
GET /api/v1/admin/operations-matrix — Operations matrix
|
||||||
|
PUT /api/v1/admin/operations/{id} — Update operation
|
||||||
|
GET /api/v1/admin/pii-rules — List PII rules
|
||||||
|
POST /api/v1/admin/pii-rules — Create PII rule
|
||||||
|
PUT /api/v1/admin/pii-rules/{id} — Update PII rule
|
||||||
|
DELETE /api/v1/admin/pii-rules/{id} — Delete PII rule
|
||||||
|
GET /api/v1/admin/policy-audit — Audit trail
|
||||||
|
GET /api/v1/admin/policy-stats — Dashboard statistics
|
||||||
|
GET /api/v1/admin/compliance-report — Compliance report
|
||||||
|
"""
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException, Depends, Query
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from database import get_db
|
||||||
|
from compliance.db.source_policy_models import (
|
||||||
|
AllowedSourceDB,
|
||||||
|
SourceOperationDB,
|
||||||
|
PIIRuleDB,
|
||||||
|
SourcePolicyAuditDB,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/v1/admin", tags=["source-policy"])
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Pydantic Schemas
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class SourceCreate(BaseModel):
    """Payload for registering a new allowed source (POST /sources)."""

    domain: str  # unique hostname, e.g. "gesetze-im-internet.de" — uniqueness enforced in DB
    name: str
    description: Optional[str] = None
    license: Optional[str] = None  # e.g. DL-DE-BY-2.0, CC-BY
    legal_basis: Optional[str] = None  # e.g. §5 UrhG
    trust_boost: float = Field(default=0.5, ge=0.0, le=1.0)  # ranking boost, clamped to [0, 1]
    source_type: str = "legal"  # legal | guidance | template
    active: bool = True
    metadata: Optional[dict] = None  # free-form JSON, stored in the 'metadata' column
|
||||||
|
|
||||||
|
|
||||||
|
class SourceUpdate(BaseModel):
    """Partial-update payload for PUT /sources/{id}; unset fields stay untouched."""

    domain: Optional[str] = None
    name: Optional[str] = None
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: Optional[float] = Field(default=None, ge=0.0, le=1.0)
    source_type: Optional[str] = None
    active: Optional[bool] = None
    metadata: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
|
class SourceResponse(BaseModel):
    """Serialized source row.

    NOTE(review): the endpoints below build response dicts by hand rather than
    returning this model; it is available for response_model/documentation use.
    """

    id: str
    domain: str
    name: str
    description: Optional[str] = None
    license: Optional[str] = None
    legal_basis: Optional[str] = None
    trust_boost: float
    source_type: str
    active: bool
    metadata: Optional[dict] = None
    created_at: str  # ISO-8601 string
    updated_at: Optional[str] = None

    class Config:
        # Allow construction from SQLAlchemy ORM attribute access.
        from_attributes = True
|
||||||
|
|
||||||
|
|
||||||
|
class OperationUpdate(BaseModel):
    """Payload for PUT /operations/{id}."""

    allowed: bool
    conditions: Optional[str] = None  # None leaves the existing conditions unchanged
|
||||||
|
|
||||||
|
|
||||||
|
class PIIRuleCreate(BaseModel):
    """Payload for creating a PII detection/masking rule."""

    name: str
    description: Optional[str] = None
    pattern: Optional[str] = None  # regex pattern
    category: str  # e.g. email, phone, financial, address
    action: str = "mask"  # mask | redact | flag
    active: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
class PIIRuleUpdate(BaseModel):
    """Partial-update payload for PUT /pii-rules/{id}; unset fields stay untouched."""

    name: Optional[str] = None
    description: Optional[str] = None
    pattern: Optional[str] = None
    category: Optional[str] = None
    action: Optional[str] = None
    active: Optional[bool] = None
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Helper: Audit logging
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def _log_audit(db: Session, action: str, entity_type: str, entity_id, old_values=None, new_values=None):
    """Stage an audit-trail row in the session; the caller commits.

    The row is only added (not committed) so the audit entry shares a
    transaction with the change it records.
    """
    db.add(
        SourcePolicyAuditDB(
            action=action,
            entity_type=entity_type,
            entity_id=entity_id,
            old_values=old_values,
            new_values=new_values,
            user_id="system",  # TODO(review): wire real user identity once auth context exists
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _source_to_dict(source: AllowedSourceDB) -> dict:
    """Snapshot of a source row for audit old_values/new_values (JSON-safe subset).

    Timestamps and the JSON metadata column are intentionally excluded.
    """
    plain_fields = (
        "domain",
        "name",
        "description",
        "license",
        "legal_basis",
        "trust_boost",
        "source_type",
        "active",
    )
    return {
        "id": str(source.id),
        **{field: getattr(source, field) for field in plain_fields},
    }
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Sources CRUD
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@router.get("/sources")
|
||||||
|
async def list_sources(
|
||||||
|
active_only: bool = Query(False),
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""List all allowed sources."""
|
||||||
|
query = db.query(AllowedSourceDB)
|
||||||
|
if active_only:
|
||||||
|
query = query.filter(AllowedSourceDB.active == True)
|
||||||
|
sources = query.order_by(AllowedSourceDB.name).all()
|
||||||
|
return {
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"id": str(s.id),
|
||||||
|
"domain": s.domain,
|
||||||
|
"name": s.name,
|
||||||
|
"description": s.description,
|
||||||
|
"license": s.license,
|
||||||
|
"legal_basis": s.legal_basis,
|
||||||
|
"trust_boost": s.trust_boost,
|
||||||
|
"source_type": s.source_type,
|
||||||
|
"active": s.active,
|
||||||
|
"metadata": s.metadata_,
|
||||||
|
"created_at": s.created_at.isoformat() if s.created_at else None,
|
||||||
|
"updated_at": s.updated_at.isoformat() if s.updated_at else None,
|
||||||
|
}
|
||||||
|
for s in sources
|
||||||
|
],
|
||||||
|
"count": len(sources),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/sources")
|
||||||
|
async def create_source(
|
||||||
|
data: SourceCreate,
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""Add a new allowed source."""
|
||||||
|
existing = db.query(AllowedSourceDB).filter(AllowedSourceDB.domain == data.domain).first()
|
||||||
|
if existing:
|
||||||
|
raise HTTPException(status_code=409, detail=f"Source with domain '{data.domain}' already exists")
|
||||||
|
|
||||||
|
source = AllowedSourceDB(
|
||||||
|
domain=data.domain,
|
||||||
|
name=data.name,
|
||||||
|
description=data.description,
|
||||||
|
license=data.license,
|
||||||
|
legal_basis=data.legal_basis,
|
||||||
|
trust_boost=data.trust_boost,
|
||||||
|
source_type=data.source_type,
|
||||||
|
active=data.active,
|
||||||
|
metadata_=data.metadata,
|
||||||
|
)
|
||||||
|
db.add(source)
|
||||||
|
_log_audit(db, "create", "source", source.id, new_values=_source_to_dict(source))
|
||||||
|
db.commit()
|
||||||
|
db.refresh(source)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"id": str(source.id),
|
||||||
|
"domain": source.domain,
|
||||||
|
"name": source.name,
|
||||||
|
"created_at": source.created_at.isoformat(),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/sources/{source_id}")
|
||||||
|
async def get_source(source_id: str, db: Session = Depends(get_db)):
|
||||||
|
"""Get a specific source."""
|
||||||
|
source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
|
||||||
|
if not source:
|
||||||
|
raise HTTPException(status_code=404, detail="Source not found")
|
||||||
|
return {
|
||||||
|
"id": str(source.id),
|
||||||
|
"domain": source.domain,
|
||||||
|
"name": source.name,
|
||||||
|
"description": source.description,
|
||||||
|
"license": source.license,
|
||||||
|
"legal_basis": source.legal_basis,
|
||||||
|
"trust_boost": source.trust_boost,
|
||||||
|
"source_type": source.source_type,
|
||||||
|
"active": source.active,
|
||||||
|
"metadata": source.metadata_,
|
||||||
|
"created_at": source.created_at.isoformat() if source.created_at else None,
|
||||||
|
"updated_at": source.updated_at.isoformat() if source.updated_at else None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.put("/sources/{source_id}")
|
||||||
|
async def update_source(
|
||||||
|
source_id: str,
|
||||||
|
data: SourceUpdate,
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""Update an existing source."""
|
||||||
|
source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
|
||||||
|
if not source:
|
||||||
|
raise HTTPException(status_code=404, detail="Source not found")
|
||||||
|
|
||||||
|
old_values = _source_to_dict(source)
|
||||||
|
update_data = data.model_dump(exclude_unset=True)
|
||||||
|
|
||||||
|
# Rename metadata to metadata_ for the DB column
|
||||||
|
if "metadata" in update_data:
|
||||||
|
update_data["metadata_"] = update_data.pop("metadata")
|
||||||
|
|
||||||
|
for key, value in update_data.items():
|
||||||
|
setattr(source, key, value)
|
||||||
|
|
||||||
|
_log_audit(db, "update", "source", source.id, old_values=old_values, new_values=update_data)
|
||||||
|
db.commit()
|
||||||
|
db.refresh(source)
|
||||||
|
|
||||||
|
return {"status": "updated", "id": str(source.id)}
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/sources/{source_id}")
|
||||||
|
async def delete_source(source_id: str, db: Session = Depends(get_db)):
|
||||||
|
"""Remove an allowed source."""
|
||||||
|
source = db.query(AllowedSourceDB).filter(AllowedSourceDB.id == source_id).first()
|
||||||
|
if not source:
|
||||||
|
raise HTTPException(status_code=404, detail="Source not found")
|
||||||
|
|
||||||
|
old_values = _source_to_dict(source)
|
||||||
|
_log_audit(db, "delete", "source", source.id, old_values=old_values)
|
||||||
|
|
||||||
|
# Also delete associated operations
|
||||||
|
db.query(SourceOperationDB).filter(SourceOperationDB.source_id == source_id).delete()
|
||||||
|
db.delete(source)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
return {"status": "deleted", "id": source_id}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Operations Matrix
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@router.get("/operations-matrix")
|
||||||
|
async def get_operations_matrix(db: Session = Depends(get_db)):
|
||||||
|
"""Get the full operations matrix."""
|
||||||
|
operations = db.query(SourceOperationDB).all()
|
||||||
|
return {
|
||||||
|
"operations": [
|
||||||
|
{
|
||||||
|
"id": str(op.id),
|
||||||
|
"source_id": str(op.source_id),
|
||||||
|
"operation": op.operation,
|
||||||
|
"allowed": op.allowed,
|
||||||
|
"conditions": op.conditions,
|
||||||
|
}
|
||||||
|
for op in operations
|
||||||
|
],
|
||||||
|
"count": len(operations),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.put("/operations/{operation_id}")
|
||||||
|
async def update_operation(
|
||||||
|
operation_id: str,
|
||||||
|
data: OperationUpdate,
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""Update an operation in the matrix."""
|
||||||
|
op = db.query(SourceOperationDB).filter(SourceOperationDB.id == operation_id).first()
|
||||||
|
if not op:
|
||||||
|
raise HTTPException(status_code=404, detail="Operation not found")
|
||||||
|
|
||||||
|
op.allowed = data.allowed
|
||||||
|
if data.conditions is not None:
|
||||||
|
op.conditions = data.conditions
|
||||||
|
|
||||||
|
_log_audit(db, "update", "operation", op.id, new_values={"allowed": data.allowed})
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
return {"status": "updated", "id": str(op.id)}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# PII Rules
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@router.get("/pii-rules")
|
||||||
|
async def list_pii_rules(db: Session = Depends(get_db)):
|
||||||
|
"""List all PII rules."""
|
||||||
|
rules = db.query(PIIRuleDB).order_by(PIIRuleDB.category, PIIRuleDB.name).all()
|
||||||
|
return {
|
||||||
|
"rules": [
|
||||||
|
{
|
||||||
|
"id": str(r.id),
|
||||||
|
"name": r.name,
|
||||||
|
"description": r.description,
|
||||||
|
"pattern": r.pattern,
|
||||||
|
"category": r.category,
|
||||||
|
"action": r.action,
|
||||||
|
"active": r.active,
|
||||||
|
"created_at": r.created_at.isoformat() if r.created_at else None,
|
||||||
|
}
|
||||||
|
for r in rules
|
||||||
|
],
|
||||||
|
"count": len(rules),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/pii-rules")
|
||||||
|
async def create_pii_rule(data: PIIRuleCreate, db: Session = Depends(get_db)):
|
||||||
|
"""Create a new PII rule."""
|
||||||
|
rule = PIIRuleDB(
|
||||||
|
name=data.name,
|
||||||
|
description=data.description,
|
||||||
|
pattern=data.pattern,
|
||||||
|
category=data.category,
|
||||||
|
action=data.action,
|
||||||
|
active=data.active,
|
||||||
|
)
|
||||||
|
db.add(rule)
|
||||||
|
_log_audit(db, "create", "pii_rule", rule.id, new_values={"name": data.name, "category": data.category})
|
||||||
|
db.commit()
|
||||||
|
db.refresh(rule)
|
||||||
|
|
||||||
|
return {"id": str(rule.id), "name": rule.name}
|
||||||
|
|
||||||
|
|
||||||
|
@router.put("/pii-rules/{rule_id}")
|
||||||
|
async def update_pii_rule(rule_id: str, data: PIIRuleUpdate, db: Session = Depends(get_db)):
|
||||||
|
"""Update a PII rule."""
|
||||||
|
rule = db.query(PIIRuleDB).filter(PIIRuleDB.id == rule_id).first()
|
||||||
|
if not rule:
|
||||||
|
raise HTTPException(status_code=404, detail="PII rule not found")
|
||||||
|
|
||||||
|
update_data = data.model_dump(exclude_unset=True)
|
||||||
|
for key, value in update_data.items():
|
||||||
|
setattr(rule, key, value)
|
||||||
|
|
||||||
|
_log_audit(db, "update", "pii_rule", rule.id, new_values=update_data)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
return {"status": "updated", "id": str(rule.id)}
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/pii-rules/{rule_id}")
|
||||||
|
async def delete_pii_rule(rule_id: str, db: Session = Depends(get_db)):
|
||||||
|
"""Delete a PII rule."""
|
||||||
|
rule = db.query(PIIRuleDB).filter(PIIRuleDB.id == rule_id).first()
|
||||||
|
if not rule:
|
||||||
|
raise HTTPException(status_code=404, detail="PII rule not found")
|
||||||
|
|
||||||
|
_log_audit(db, "delete", "pii_rule", rule.id, old_values={"name": rule.name, "category": rule.category})
|
||||||
|
db.delete(rule)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
|
return {"status": "deleted", "id": rule_id}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Audit Trail
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@router.get("/policy-audit")
|
||||||
|
async def get_policy_audit(
|
||||||
|
limit: int = Query(50, ge=1, le=500),
|
||||||
|
offset: int = Query(0, ge=0),
|
||||||
|
entity_type: Optional[str] = None,
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
):
|
||||||
|
"""Get the audit trail for source policy changes."""
|
||||||
|
query = db.query(SourcePolicyAuditDB)
|
||||||
|
if entity_type:
|
||||||
|
query = query.filter(SourcePolicyAuditDB.entity_type == entity_type)
|
||||||
|
|
||||||
|
total = query.count()
|
||||||
|
entries = query.order_by(SourcePolicyAuditDB.created_at.desc()).offset(offset).limit(limit).all()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"entries": [
|
||||||
|
{
|
||||||
|
"id": str(e.id),
|
||||||
|
"action": e.action,
|
||||||
|
"entity_type": e.entity_type,
|
||||||
|
"entity_id": str(e.entity_id) if e.entity_id else None,
|
||||||
|
"old_values": e.old_values,
|
||||||
|
"new_values": e.new_values,
|
||||||
|
"user_id": e.user_id,
|
||||||
|
"created_at": e.created_at.isoformat() if e.created_at else None,
|
||||||
|
}
|
||||||
|
for e in entries
|
||||||
|
],
|
||||||
|
"total": total,
|
||||||
|
"limit": limit,
|
||||||
|
"offset": offset,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Dashboard Statistics
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@router.get("/policy-stats")
|
||||||
|
async def get_policy_stats(db: Session = Depends(get_db)):
|
||||||
|
"""Get dashboard statistics for source policy."""
|
||||||
|
total_sources = db.query(AllowedSourceDB).count()
|
||||||
|
active_sources = db.query(AllowedSourceDB).filter(AllowedSourceDB.active == True).count()
|
||||||
|
pii_rules = db.query(PIIRuleDB).filter(PIIRuleDB.active == True).count()
|
||||||
|
|
||||||
|
# Count audit entries from today
|
||||||
|
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
|
||||||
|
blocked_today = db.query(SourcePolicyAuditDB).filter(
|
||||||
|
SourcePolicyAuditDB.action == "delete",
|
||||||
|
SourcePolicyAuditDB.created_at >= today_start,
|
||||||
|
).count()
|
||||||
|
|
||||||
|
blocked_total = db.query(SourcePolicyAuditDB).filter(
|
||||||
|
SourcePolicyAuditDB.action == "delete",
|
||||||
|
).count()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"active_policies": active_sources,
|
||||||
|
"allowed_sources": total_sources,
|
||||||
|
"pii_rules": pii_rules,
|
||||||
|
"blocked_today": blocked_today,
|
||||||
|
"blocked_total": blocked_total,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/compliance-report")
|
||||||
|
async def get_compliance_report(db: Session = Depends(get_db)):
|
||||||
|
"""Generate a compliance report for source policies."""
|
||||||
|
sources = db.query(AllowedSourceDB).filter(AllowedSourceDB.active == True).all()
|
||||||
|
pii_rules = db.query(PIIRuleDB).filter(PIIRuleDB.active == True).all()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"report_date": datetime.utcnow().isoformat(),
|
||||||
|
"summary": {
|
||||||
|
"active_sources": len(sources),
|
||||||
|
"active_pii_rules": len(pii_rules),
|
||||||
|
"source_types": list(set(s.source_type for s in sources)),
|
||||||
|
"licenses": list(set(s.license for s in sources if s.license)),
|
||||||
|
},
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"domain": s.domain,
|
||||||
|
"name": s.name,
|
||||||
|
"license": s.license,
|
||||||
|
"legal_basis": s.legal_basis,
|
||||||
|
"trust_boost": s.trust_boost,
|
||||||
|
}
|
||||||
|
for s in sources
|
||||||
|
],
|
||||||
|
"pii_rules": [
|
||||||
|
{
|
||||||
|
"name": r.name,
|
||||||
|
"category": r.category,
|
||||||
|
"action": r.action,
|
||||||
|
}
|
||||||
|
for r in pii_rules
|
||||||
|
],
|
||||||
|
}
|
||||||
105
backend-compliance/compliance/db/source_policy_models.py
Normal file
105
backend-compliance/compliance/db/source_policy_models.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""
|
||||||
|
SQLAlchemy models for Source Policy Management.
|
||||||
|
|
||||||
|
Tables:
|
||||||
|
- compliance_allowed_sources: Whitelisted data sources for RAG corpus
|
||||||
|
- compliance_source_operations: Operations matrix for source data flows
|
||||||
|
- compliance_pii_rules: PII detection/masking rules for sources
|
||||||
|
- compliance_source_policy_audit: Audit trail for source policy changes
|
||||||
|
"""
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from sqlalchemy import (
|
||||||
|
Column, String, Text, Boolean, DateTime, Float, JSON, Index
|
||||||
|
)
|
||||||
|
from sqlalchemy.dialects.postgresql import UUID
|
||||||
|
|
||||||
|
from classroom_engine.database import Base
|
||||||
|
|
||||||
|
|
||||||
|
class AllowedSourceDB(Base):
    """Whitelisted data source for the compliance RAG corpus.

    One row per unique domain. The operations matrix and audit trail reference
    rows here by id; no ORM relationship is declared (FK lives in the SQL
    migration).
    """

    __tablename__ = 'compliance_allowed_sources'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    domain = Column(String(255), unique=True, nullable=False)
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    license = Column(String(100), nullable=True)  # DL-DE-BY-2.0, CC-BY, etc.
    legal_basis = Column(String(200), nullable=True)  # §5 UrhG, etc.
    trust_boost = Column(Float, default=0.5)  # API validates range 0.0–1.0
    source_type = Column(String(50), default='legal')  # legal, guidance, template
    active = Column(Boolean, default=True)
    # Attribute is 'metadata_' because 'metadata' is reserved on SQLAlchemy
    # declarative bases; the underlying DB column is still named 'metadata'.
    metadata_ = Column('metadata', JSON, nullable=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_allowed_sources_domain', 'domain'),
        Index('idx_allowed_sources_active', 'active'),
    )

    def __repr__(self):
        return f"<AllowedSource {self.domain}: {self.name}>"
|
||||||
|
|
||||||
|
|
||||||
|
class SourceOperationDB(Base):
    """Operations matrix entry: whether a data-flow operation is allowed for a source."""

    __tablename__ = 'compliance_source_operations'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    # References compliance_allowed_sources.id; the FK (with ON DELETE CASCADE)
    # is declared in the SQL migration, not at the ORM level.
    source_id = Column(UUID(as_uuid=True), nullable=False)
    operation = Column(String(50), nullable=False)  # ingest, search, export, share
    allowed = Column(Boolean, default=True)
    conditions = Column(Text, nullable=True)  # free-text conditions attached to this operation
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_source_operations_source', 'source_id'),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class PIIRuleDB(Base):
    """PII detection and masking rule applied to compliance sources."""

    __tablename__ = 'compliance_pii_rules'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    description = Column(Text, nullable=True)
    pattern = Column(Text, nullable=True)  # regex pattern
    category = Column(String(50), nullable=False)  # email, phone, name, address, etc.
    action = Column(String(20), default='mask')  # mask, redact, flag
    active = Column(Boolean, default=True)
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, nullable=True)

    __table_args__ = (
        Index('idx_pii_rules_category', 'category'),
        Index('idx_pii_rules_active', 'active'),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class SourcePolicyAuditDB(Base):
    """Append-only audit trail row for source policy changes."""

    __tablename__ = 'compliance_source_policy_audit'

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    action = Column(String(20), nullable=False)  # create, update, delete
    entity_type = Column(String(50), nullable=False)  # source, operation, pii_rule
    entity_id = Column(UUID(as_uuid=True), nullable=True)
    old_values = Column(JSON, nullable=True)  # snapshot before the change (deletes/updates)
    new_values = Column(JSON, nullable=True)  # applied changes (creates/updates)
    user_id = Column(String(100), nullable=True)  # router currently always writes "system"
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)

    __table_args__ = (
        Index('idx_source_audit_entity', 'entity_type', 'entity_id'),
        Index('idx_source_audit_created', 'created_at'),
    )
|
||||||
@@ -21,6 +21,9 @@ from dsr_admin_api import router as dsr_admin_router, templates_router as dsr_te
|
|||||||
# Compliance framework sub-package
|
# Compliance framework sub-package
|
||||||
from compliance.api import router as compliance_framework_router
|
from compliance.api import router as compliance_framework_router
|
||||||
|
|
||||||
|
# Source Policy
|
||||||
|
from compliance.api.source_policy_router import router as source_policy_router
|
||||||
|
|
||||||
# Middleware
|
# Middleware
|
||||||
from middleware import (
|
from middleware import (
|
||||||
RequestIDMiddleware,
|
RequestIDMiddleware,
|
||||||
@@ -85,6 +88,9 @@ app.include_router(dsr_templates_router, prefix="/api")
|
|||||||
# Compliance Framework (regulations, controls, evidence, risks, audits, ISMS)
|
# Compliance Framework (regulations, controls, evidence, risks, audits, ISMS)
|
||||||
app.include_router(compliance_framework_router, prefix="/api")
|
app.include_router(compliance_framework_router, prefix="/api")
|
||||||
|
|
||||||
|
# Source Policy (allowed sources, PII rules, audit)
|
||||||
|
app.include_router(source_policy_router, prefix="/api")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|||||||
73
backend-compliance/migrations/001_source_policy.sql
Normal file
73
backend-compliance/migrations/001_source_policy.sql
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
-- =============================================================================
-- Migration 001: Source Policy Tables
--
-- Tables for managing allowed compliance data sources, operations matrix,
-- PII rules, and audit trail. Mirrors the SQLAlchemy models in
-- compliance/db/source_policy_models.py.
-- =============================================================================

-- Whitelisted data sources; one row per unique domain.
CREATE TABLE IF NOT EXISTS compliance_allowed_sources (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    domain VARCHAR(255) UNIQUE NOT NULL,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    license VARCHAR(100),
    legal_basis VARCHAR(200),
    trust_boost FLOAT DEFAULT 0.5,
    source_type VARCHAR(50) DEFAULT 'legal',
    active BOOLEAN DEFAULT true,
    metadata JSON,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);

CREATE INDEX IF NOT EXISTS idx_allowed_sources_domain ON compliance_allowed_sources(domain);
CREATE INDEX IF NOT EXISTS idx_allowed_sources_active ON compliance_allowed_sources(active);

-- Per-source operations matrix; rows are removed with their source (CASCADE).
CREATE TABLE IF NOT EXISTS compliance_source_operations (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id UUID NOT NULL REFERENCES compliance_allowed_sources(id) ON DELETE CASCADE,
    operation VARCHAR(50) NOT NULL,
    allowed BOOLEAN DEFAULT true,
    conditions TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);

CREATE INDEX IF NOT EXISTS idx_source_operations_source ON compliance_source_operations(source_id);

-- PII detection/masking rules (pattern is a regex; action: mask/redact/flag).
CREATE TABLE IF NOT EXISTS compliance_pii_rules (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description TEXT,
    pattern TEXT,
    category VARCHAR(50) NOT NULL,
    action VARCHAR(20) DEFAULT 'mask',
    active BOOLEAN DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ
);

CREATE INDEX IF NOT EXISTS idx_pii_rules_category ON compliance_pii_rules(category);
CREATE INDEX IF NOT EXISTS idx_pii_rules_active ON compliance_pii_rules(active);

-- Append-only audit trail of source-policy changes.
CREATE TABLE IF NOT EXISTS compliance_source_policy_audit (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    action VARCHAR(20) NOT NULL,
    entity_type VARCHAR(50) NOT NULL,
    entity_id UUID,
    old_values JSON,
    new_values JSON,
    user_id VARCHAR(100),
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_source_audit_entity ON compliance_source_policy_audit(entity_type, entity_id);
CREATE INDEX IF NOT EXISTS idx_source_audit_created ON compliance_source_policy_audit(created_at);

-- Seed default PII rules (German names/descriptions are deliberate UI text).
-- NOTE(review): ON CONFLICT DO NOTHING only deduplicates when a unique
-- constraint exists; none is declared on (name), so re-running this migration
-- may insert duplicate seed rows — confirm intended behavior.
INSERT INTO compliance_pii_rules (name, category, pattern, action, description) VALUES
('E-Mail-Adresse', 'email', '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', 'mask', 'E-Mail-Adressen erkennen und maskieren'),
('Telefonnummer', 'phone', '(\+49|0)[0-9\s/-]{8,15}', 'mask', 'Deutsche Telefonnummern erkennen'),
('IBAN', 'financial', 'DE[0-9]{2}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{4}\s?[0-9]{2}', 'redact', 'Deutsche IBAN-Nummern erkennen und entfernen'),
('Postadresse', 'address', '[0-9]{5}\s+[A-Z][a-z]', 'flag', 'Postleitzahlen mit Ortsnamen markieren')
ON CONFLICT DO NOTHING;
|
||||||
971
scripts/ingest-legal-corpus.sh
Executable file
971
scripts/ingest-legal-corpus.sh
Executable file
@@ -0,0 +1,971 @@
|
|||||||
|
#!/usr/bin/env bash
# =============================================================================
# BreakPilot Compliance — RAG Legal Corpus Ingestion
#
# Downloads 23 freely licensed legal sources and ingests them into Qdrant
# via the Core RAG API (port 8097).
#
# Run on the Mac Mini:
#   ~/rag-ingestion/ingest-legal-corpus.sh [--skip-download] [--only PHASE]
#
# Phases: download, gesetze, eu, templates, datenschutz, verify
# =============================================================================
set -euo pipefail

# --- Configuration -----------------------------------------------------------
WORK_DIR="${WORK_DIR:-$HOME/rag-ingestion}"
RAG_URL="https://localhost:8097/api/v1/documents/upload"
QDRANT_URL="http://localhost:6333"
SDK_URL="${SDK_URL:-https://localhost:8093}"
DB_URL="${DB_URL:-postgresql://localhost:5432/breakpilot?search_path=compliance,core,public}"
# -k tolerates local self-signed certs; generous --max-time for large documents.
CURL_OPTS="-sk --connect-timeout 10 --max-time 300"

# Counters — accumulated across all phases for the final summary.
UPLOADED=0
FAILED=0
SKIPPED=0

# --- CLI Args ----------------------------------------------------------------
SKIP_DOWNLOAD=false
ONLY_PHASE=""

while [[ $# -gt 0 ]]; do
    case $1 in
        --skip-download) SKIP_DOWNLOAD=true; shift ;;
        --only) ONLY_PHASE="$2"; shift 2 ;;
        -h|--help)
            echo "Usage: $0 [--skip-download] [--only PHASE]"
            echo "Phases: download, gesetze, eu, templates, datenschutz, verify, version"
            exit 0
            ;;
        *) echo "Unknown option: $1"; exit 1 ;;
    esac
done
|
||||||
|
|
||||||
|
# --- Helpers -----------------------------------------------------------------
|
||||||
|
# Timestamped logging helpers; warn/fail write to stderr.
_ts()  { date '+%H:%M:%S'; }
log()  { echo "[$(_ts)] $*"; }
ok()   { echo "[$(_ts)] ✓ $*"; }
warn() { echo "[$(_ts)] ⚠ $*" >&2; }
fail() { echo "[$(_ts)] ✗ $*" >&2; }
|
||||||
|
|
||||||
|
# upload_file FILE COLLECTION DATA_TYPE USE_CASE YEAR METADATA_JSON [LABEL]
# Uploads one document to the RAG API and updates UPLOADED/FAILED/SKIPPED.
# Returns non-zero on any failure — callers run under `set -e` and must
# tolerate that (e.g. `upload_file ... || true`).
upload_file() {
    local file="$1"
    local collection="$2"
    local data_type="$3"
    local use_case="$4"
    local year="$5"
    local metadata_json="$6"
    local label="${7:-$(basename "$file")}"

    if [[ ! -f "$file" ]]; then
        warn "File not found: $file"
        FAILED=$((FAILED + 1))
        return 1
    fi

    # stat -f%z is macOS, stat -c%s is GNU/Linux; fall back to 0 on both failing.
    local filesize
    filesize=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null || echo 0)
    if [[ "$filesize" -lt 100 ]]; then
        # Tiny files are almost certainly failed/empty downloads — skip them.
        warn "File too small (${filesize}B), skipping: $label"
        SKIPPED=$((SKIPPED + 1))
        return 1
    fi

    log "Uploading: $label → $collection ($(( filesize / 1024 ))KB)"

    # CURL_OPTS is intentionally unquoted so its options word-split.
    local response
    response=$(curl $CURL_OPTS -X POST "$RAG_URL" \
        -F "file=@${file}" \
        -F "collection=${collection}" \
        -F "data_type=${data_type}" \
        -F "use_case=${use_case}" \
        -F "year=${year}" \
        -F "chunk_strategy=recursive" \
        -F "chunk_size=512" \
        -F "chunk_overlap=50" \
        -F "metadata_json=${metadata_json}" \
        2>/dev/null) || true

    # The API reports success as either "chunks_count" or "vectors_indexed";
    # both are accepted. Anything else counts as a failed upload.
    if echo "$response" | grep -q '"chunks_count"'; then
        local chunks
        chunks=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('chunks_count',0))" 2>/dev/null || echo "?")
        ok "$label → $chunks chunks"
        UPLOADED=$((UPLOADED + 1))
    elif echo "$response" | grep -q '"vectors_indexed"'; then
        local vectors
        vectors=$(echo "$response" | python3 -c "import sys,json; print(json.load(sys.stdin).get('vectors_indexed',0))" 2>/dev/null || echo "?")
        ok "$label → $vectors vectors"
        UPLOADED=$((UPLOADED + 1))
    else
        fail "Upload failed: $label"
        fail "Response: $response"
        FAILED=$((FAILED + 1))
        return 1
    fi
}
|
||||||
|
|
||||||
|
# clone_repo URL TARGET — shallow-clone URL into TARGET unless already present.
clone_repo() {
    local url="$1" target="$2"

    if [[ -d "$target" ]]; then
        log "Repo exists: $target (skipping clone)"
        return 0
    fi

    log "Cloning: $url"
    if ! git clone --depth 1 "$url" "$target" 2>/dev/null; then
        warn "Clone failed: $url"
        return 1
    fi
}
|
||||||
|
|
||||||
|
# download_pdf URL TARGET — fetch a document unless TARGET already exists.
download_pdf() {
    local url="$1" target="$2"

    if [[ -f "$target" ]]; then
        log "PDF exists: $(basename "$target") (skipping)"
        return 0
    fi

    log "Downloading: $(basename "$target")"
    if ! curl $CURL_OPTS -L "$url" -o "$target" 2>/dev/null; then
        warn "Download failed: $url"
        return 1
    fi
}
|
||||||
|
|
||||||
|
# Extract text from gesetze-im-internet.de HTML page
|
||||||
|
# extract_gesetz_html URL OUTPUT LABEL
# Fetches a gesetze-im-internet.de HTML page and writes extracted plain text
# to OUTPUT. On failure the partial OUTPUT file is removed so a re-run does
# not mistake it for a finished extraction and silently skip the law.
extract_gesetz_html() {
    local url="$1"
    local output="$2"
    local label="$3"

    if [[ -f "$output" ]]; then
        log "Text exists: $(basename "$output") (skipping)"
        return 0
    fi

    log "Extracting: $label from gesetze-im-internet.de"
    curl $CURL_OPTS -L "$url" 2>/dev/null \
        | python3 -c "
import sys, codecs

# gesetze-im-internet.de uses ISO-8859-1 encoding
sys.stdin = codecs.getreader('iso-8859-1')(sys.stdin.buffer)

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.text = []
        self.in_content = False
        self.skip = False

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == 'div' and 'jnhtml' in attrs_dict.get('class', ''):
            self.in_content = True
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = True

    def handle_endtag(self, tag):
        if tag in ('script', 'style', 'nav', 'header', 'footer'):
            self.skip = False
        if tag in ('p', 'div', 'br', 'h1', 'h2', 'h3', 'h4', 'li'):
            self.text.append('\n')

    def handle_data(self, data):
        if not self.skip:
            self.text.append(data)

parser = TextExtractor()
parser.feed(sys.stdin.read())
print(''.join(parser.text).strip())
" > "$output" || {
        warn "Extraction failed: $label"
        # Remove the partial file; otherwise the existence check above would
        # treat the truncated output as complete on the next run.
        rm -f "$output"
        return 1
    }
}
|
||||||
|
|
||||||
|
# Concatenate the Markdown files of one law from the bundestag/gesetze repo
# into a single text file headed by the law's label.
#   $1 = directory of the law inside the repo checkout
#   $2 = output text file
#   $3 = label used as the document heading
concat_bundestag_gesetz() {
    local gesetz_dir="$1"
    local output="$2"
    local label="$3"

    if [[ ! -d "$gesetz_dir" ]]; then
        warn "Gesetz directory not found: $gesetz_dir"
        return 0
    fi

    log "Concatenating: $label"
    {
        echo "# $label"
        echo ""
        # Version-sort so multi-digit paragraph files order numerically
        # (__10.md after __2.md); plain lexicographic sort misorders them.
        find "$gesetz_dir" -name "*.md" -type f | sort -V | while read -r f; do
            cat "$f"
            echo ""
            echo "---"
            echo ""
        done
    } > "$output"
}
|
||||||
|
|
||||||
|
# Print the points_count of a Qdrant collection, or "?" when the collection
# or the Qdrant endpoint is unreachable/unparsable.
#   $1 = collection name
collection_count() {
    local name="$1"
    local response
    response=$(curl -s "${QDRANT_URL}/collections/${name}" 2>/dev/null)
    printf '%s' "$response" \
        | python3 -c "import sys,json; print(json.load(sys.stdin)['result']['points_count'])" 2>/dev/null || echo "?"
}
|
||||||
|
|
||||||
|
# =============================================================================
# PHASE A: Downloads
# =============================================================================
# Fetch every external artifact (EUR-Lex PDFs, German statute paragraphs as
# HTML, template git repos, EDPB/EDPS guidance PDFs) into $WORK_DIR, then
# derive concatenated full-law text files from the bundestag/gesetze checkout.
phase_download() {
    log "=========================================="
    log "PHASE A: Downloads (PDFs + Git-Repos)"
    log "=========================================="

    mkdir -p "$WORK_DIR"/{pdfs,repos,texts}

    local item url rest

    # --- A1: EUR-Lex PDFs ---
    log "--- EUR-Lex PDFs ---"
    # Format: "<url>|<filename under pdfs/>"  ('|' because URLs contain ':')
    local -a eurlex_pdfs=(
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32022R2065|dsa_2022_2065.pdf"
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32002L0058|eprivacy_2002_58.pdf"
        "https://eur-lex.europa.eu/legal-content/DE/TXT/PDF/?uri=CELEX:32021D0914|scc_2021_914.pdf"
    )
    for item in "${eurlex_pdfs[@]}"; do
        download_pdf "${item%%|*}" "$WORK_DIR/pdfs/${item##*|}"
    done

    # --- A2: Deutsche Gesetze (Einzelparagraphen) ---
    log "--- Deutsche Gesetze (Einzelparagraphen) ---"
    # Format: "<url>|<text filename>|<label>"
    # Note: TDDDG is still published under its old name "ttdsg" on
    # gesetze-im-internet.de; EGBGB Art. 246a § 1 carries the model
    # withdrawal notice.
    local -a paragraph_pages=(
        "https://www.gesetze-im-internet.de/ddg/__5.html|ddg_5.txt|DDG § 5 (Impressum)"
        "https://www.gesetze-im-internet.de/ttdsg/__25.html|tdddg_25.txt|TDDDG § 25 (Cookies)"
        "https://www.gesetze-im-internet.de/urhg/__5.html|urhg_5.txt|UrhG § 5 (Amtliche Werke)"
        "https://www.gesetze-im-internet.de/bgbeg/art_246a__1.html|egbgb_widerruf.txt|EGBGB Muster-Widerrufsbelehrung"
    )
    local text_file text_label
    for item in "${paragraph_pages[@]}"; do
        url="${item%%|*}"
        rest="${item#*|}"
        text_file="${rest%%|*}"
        text_label="${rest#*|}"
        extract_gesetz_html "$url" "$WORK_DIR/texts/$text_file" "$text_label"
    done

    # --- A3: Git-Repos ---
    log "--- Git-Repos ---"
    # Format: "<url>|<target dir under repos/>|<required|optional>".
    # "optional" clones may fail without aborting the run (the original
    # explicit '|| true' calls); "required" clones propagate failure.
    local -a repo_list=(
        "https://github.com/bundestag/gesetze.git|bundestag-gesetze|required"
        "https://github.com/github/site-policy.git|github-site-policy|required"
        "https://github.com/opengovfoundation/site-policy.git|opengov-site-policy|required"
        "https://github.com/creativecommons/cc-legal-tools-data.git|cc-legal-tools|required"
        "https://github.com/oprvc/oprvc.github.io.git|oprvc|required"
        "https://github.com/webflorist/privacy-policy-text.git|webflorist|required"
        "https://github.com/Tempest-Solutions-Company/privacy-policy-generator.git|tempest-privacy|optional"
        "https://github.com/Tempest-Solutions-Company/terms-of-service-generator.git|tempest-tos|optional"
        "https://github.com/Tempest-Solutions-Company/cookie-banner-consent-solution.git|tempest-cookie|optional"
        "https://github.com/orestbida/cookieconsent.git|cookieconsent|optional"
        # CommonPaper keeps one repo per contract type.
        "https://github.com/CommonPaper/CSA.git|common-paper-csa|optional"
        "https://github.com/CommonPaper/SLA.git|common-paper-sla|optional"
        "https://github.com/CommonPaper/PSA.git|common-paper-psa|optional"
        # OpenCode.de data-use clause templates - try HTTPS.
        "https://gitlab.opencode.de/wernerth/datennutzungsklauseln-muster.git|datennutzungsklauseln|optional"
    )
    local repo_target repo_mode
    for item in "${repo_list[@]}"; do
        url="${item%%|*}"
        rest="${item#*|}"
        repo_target="${rest%%|*}"
        repo_mode="${rest##*|}"
        if [[ "$repo_mode" == "optional" ]]; then
            clone_repo "$url" "$WORK_DIR/repos/$repo_target" || true
        else
            clone_repo "$url" "$WORK_DIR/repos/$repo_target"
        fi
    done

    # --- A4: EDPB/EDPS PDFs (verified URLs) ---
    log "--- EDPB/EDPS Guidance PDFs ---"
    # Format: "<url>|<filename under pdfs/>"
    local -a guidance_pdfs=(
        # EDPB Guidelines 05/2020 on Consent
        "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_202005_consent_en.pdf|edpb_consent_guidelines.pdf"
        # EDPB Guidelines 4/2019 Data Protection by Design and Default
        "https://www.edpb.europa.eu/sites/default/files/files/file1/edpb_guidelines_201904_dataprotection_by_design_and_by_default_v2.0_en.pdf|edpb_privacy_by_design.pdf"
        # EDPB Guidelines 03/2022 Dark Patterns
        "https://www.edpb.europa.eu/system/files/2023-02/edpb_03-2022_guidelines_on_deceptive_design_patterns_in_social_media_platform_interfaces_v2_en_0.pdf|edpb_dark_patterns.pdf"
        # EDPB Guidelines 8/2020 Social Media Targeting
        "https://www.edpb.europa.eu/system/files/2021-04/edpb_guidelines_082020_on_the_targeting_of_social_media_users_en.pdf|edpb_social_media_targeting.pdf"
        # EDPB Cookie Banner Taskforce Report (Jan 2023)
        "https://www.edpb.europa.eu/system/files/2023-01/edpb_20230118_report_cookie_banner_taskforce_en.pdf|edpb_cookie_banner_taskforce.pdf"
        # EDPB Guidelines 2/2023 ePrivacy Art. 5(3) Technical Scope
        "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202302_technical_scope_art_53_eprivacydirective_v2_en_0.pdf|edpb_eprivacy_art53.pdf"
        # EDPB Guidelines 1/2024 Legitimate Interest
        "https://www.edpb.europa.eu/system/files/2024-10/edpb_guidelines_202401_legitimateinterest_en.pdf|edpb_legitimate_interest.pdf"
        # EDPB DPO Coordinated Enforcement Report 2024
        "https://www.edpb.europa.eu/system/files/2024-01/edpb_report_20240116_cef_dpo_en.pdf|edpb_dpo_report.pdf"
        # EDPS GenAI Orientations (June 2024)
        "https://www.edps.europa.eu/system/files/2024-06/24-06-03_genai_orientations_en.pdf|edps_generative_ai.pdf"
        # EDPS Digital Ethics Report (2018)
        "https://edps.europa.eu/sites/edp/files/publication/18-01-25_eag_report_en.pdf|edps_digital_ethics.pdf"
    )
    for item in "${guidance_pdfs[@]}"; do
        download_pdf "${item%%|*}" "$WORK_DIR/pdfs/${item##*|}"
    done

    # --- A5: Text extraction from repos ---
    log "--- Text-Extraktion aus Repos ---"

    # bundestag/gesetze is partially outdated: DDG, TDDDG and EGBGB are
    # missing from the repo - only BGB, UrhG and TMG are present.
    local -a bundestag_gesetze=(
        "b/bgb:BGB"
        "u/urhg:UrhG"
        "t/tmg:TMG"
    )
    local law_path law_label law_dir law_lower
    for item in "${bundestag_gesetze[@]}"; do
        law_path="${item%%:*}"
        law_label="${item##*:}"
        law_dir="$WORK_DIR/repos/bundestag-gesetze/$law_path"
        if [[ -d "$law_dir" ]]; then
            law_lower=$(echo "$law_label" | tr '[:upper:]' '[:lower:]')
            concat_bundestag_gesetz "$law_dir" \
                "$WORK_DIR/texts/bundestag_${law_lower}_komplett.txt" \
                "$law_label (komplett)"
        else
            warn "Bundestag Gesetz nicht gefunden: $law_dir"
        fi
    done

    log "Download phase complete."
}
|
||||||
|
|
||||||
|
# =============================================================================
# PHASE B: Deutsche Gesetze → bp_compliance_gesetze
# =============================================================================
# Uploads the individually extracted paragraphs (B1) and the concatenated
# full-law texts produced from the bundestag/gesetze checkout (B2).
phase_gesetze() {
    log "=========================================="
    log "PHASE B: Deutsche Gesetze → bp_compliance_gesetze"
    log "=========================================="

    local collection="bp_compliance_gesetze"
    local count_before count_after
    count_before=$(collection_count "$collection")
    log "Collection $collection: $count_before chunks (before)"

    # B1: individual paragraphs extracted in phase A
    upload_file "$WORK_DIR/texts/ddg_5.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"ddg_5","regulation_name_de":"Digitale-Dienste-Gesetz § 5","category":"impressum","license":"public_law","source":"gesetze-im-internet.de"}' \
        "DDG § 5 (Impressumspflicht)"

    upload_file "$WORK_DIR/texts/tdddg_25.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"tdddg_25","regulation_name_de":"TDDDG § 25","category":"cookies","license":"public_law","source":"gesetze-im-internet.de"}' \
        "TDDDG § 25 (Cookies/Endgeraetezugriff)"

    upload_file "$WORK_DIR/texts/urhg_5.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"urhg_5","regulation_name_de":"UrhG § 5","category":"urheberrecht","license":"public_law","source":"gesetze-im-internet.de"}' \
        "UrhG § 5 (Amtliche Werke)"

    upload_file "$WORK_DIR/texts/egbgb_widerruf.txt" "$collection" "compliance" "legal_reference" "2024" \
        '{"regulation_id":"egbgb_widerruf","regulation_name_de":"EGBGB Muster-Widerrufsbelehrung","category":"widerruf","license":"public_law","source":"gesetze-im-internet.de"}' \
        "EGBGB Muster-Widerrufsbelehrung"

    # B2: full-law texts from the Bundestag repo.
    # Entry format: "<file slug>:<short name>:<full name>".
    local -a full_laws=(
        "bgb:BGB:Buergerliches Gesetzbuch"
        "urhg:UrhG:Urheberrechtsgesetz"
        "tmg:TMG:Telemediengesetz"
    )
    local entry slug short fullname law_file
    for entry in "${full_laws[@]}"; do
        IFS=: read -r slug short fullname <<< "$entry"
        law_file="$WORK_DIR/texts/bundestag_${slug}_komplett.txt"
        if [[ -f "$law_file" ]]; then
            upload_file "$law_file" "$collection" "compliance" "legal_reference" "2024" \
                "{\"regulation_id\":\"${slug}_komplett\",\"regulation_name_de\":\"$fullname ($short komplett)\",\"category\":\"volltext\",\"license\":\"unlicense\",\"source\":\"github.com/bundestag/gesetze\"}" \
                "$short komplett (Bundestag-Repo)"
        fi
    done

    count_after=$(collection_count "$collection")
    log "Collection $collection: $count_before → $count_after chunks"
}
|
||||||
|
|
||||||
|
# =============================================================================
# PHASE C: EU-Rechtstexte → bp_compliance_ce
# =============================================================================
# Uploads the three EUR-Lex PDFs fetched in phase A into the EU collection.
phase_eu() {
    log "=========================================="
    log "PHASE C: EU-Rechtstexte → bp_compliance_ce"
    log "=========================================="

    local collection="bp_compliance_ce"
    local count_before count_after
    count_before=$(collection_count "$collection")
    log "Collection $collection: $count_before chunks (before)"

    # Regulation (EU) 2022/2065 - Digital Services Act
    upload_file "$WORK_DIR/pdfs/dsa_2022_2065.pdf" "$collection" "compliance_ce" "legal_reference" "2022" \
        '{"regulation_id":"eu_2022_2065","regulation_name_de":"Digital Services Act (DSA)","regulation_name_en":"Digital Services Act","regulation_short":"DSA","category":"plattformregulierung","celex":"32022R2065","source":"eur-lex","license":"public_law"}' \
        "Digital Services Act (EU) 2022/2065"

    # Directive 2002/58/EC - ePrivacy
    upload_file "$WORK_DIR/pdfs/eprivacy_2002_58.pdf" "$collection" "compliance_ce" "legal_reference" "2002" \
        '{"regulation_id":"eu_2002_58","regulation_name_de":"ePrivacy-Richtlinie","regulation_name_en":"ePrivacy Directive","regulation_short":"ePrivacy","category":"datenschutz","celex":"32002L0058","source":"eur-lex","license":"public_law"}' \
        "ePrivacy-Richtlinie 2002/58/EC"

    # Decision (EU) 2021/914 - Standard Contractual Clauses
    upload_file "$WORK_DIR/pdfs/scc_2021_914.pdf" "$collection" "compliance_ce" "legal_reference" "2021" \
        '{"regulation_id":"eu_2021_914","regulation_name_de":"Standardvertragsklauseln (SCC)","regulation_name_en":"Standard Contractual Clauses","regulation_short":"SCC","category":"datentransfer","celex":"32021D0914","source":"eur-lex","license":"public_law"}' \
        "Standardvertragsklauseln (EU) 2021/914"

    count_after=$(collection_count "$collection")
    log "Collection $collection: $count_before → $count_after chunks"
}
|
||||||
|
|
||||||
|
# =============================================================================
# PHASE D: Templates/Textbausteine → bp_legal_templates
# =============================================================================
# Uploads template/policy documents from the cloned repos.
#
# FIX: all file lists are fed to the while-loops via process substitution
# (done < <(find ...)) instead of a pipeline. A pipeline runs the loop body
# in a subshell, which silently discarded the UPLOADED/FAILED counter
# updates made by upload_file (and, under 'set -e', a single failed upload
# aborted the rest of a section's subshell loop). Per-file uploads are
# best-effort ('|| true') so one bad file never aborts the whole phase;
# upload_file itself records the failure.
phase_templates() {
    log "=========================================="
    log "PHASE D: Templates → bp_legal_templates"
    log "=========================================="

    local col="bp_legal_templates"
    local before
    before=$(collection_count "$col")
    log "Collection $col: $before chunks (before)"

    # --- D1: GitHub Site Policy (CC0) ---
    local repo="$WORK_DIR/repos/github-site-policy"
    if [[ -d "$repo" ]]; then
        log "--- GitHub Site Policy ---"
        while read -r f; do
            local basename
            basename=$(basename "$f" .md)
            # Classify the document by filename keywords.
            local doc_type="policy"
            case "$basename" in
                *terms*|*tos*|*service*) doc_type="tos" ;;
                *privacy*|*data*) doc_type="privacy_policy" ;;
                *dmca*|*copyright*) doc_type="dmca" ;;
                *acceptable*|*use*) doc_type="acceptable_use" ;;
            esac
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"github_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/github/site-policy\",\"filename\":\"$basename\"}" \
                "GitHub: $basename" || true
        done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
    fi

    # --- D2: OpenGov Site Policy (CC0) ---
    repo="$WORK_DIR/repos/opengov-site-policy"
    if [[ -d "$repo" ]]; then
        log "--- OpenGov Site Policy ---"
        while read -r f; do
            local basename
            basename=$(basename "$f" .md)
            local doc_type="policy"
            case "$basename" in
                *terms*|*tos*) doc_type="tos" ;;
                *privacy*) doc_type="privacy_policy" ;;
            esac
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"opengov_site_policy\",\"doc_type\":\"$doc_type\",\"license\":\"cc0\",\"source\":\"github.com/opengovfoundation/site-policy\",\"filename\":\"$basename\"}" \
                "OpenGov: $basename" || true
        done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" -not -name "CONTRIBUTING.md" | sort)
    fi

    # --- D3: Creative Commons Legal Tools (CC0) ---
    repo="$WORK_DIR/repos/cc-legal-tools"
    if [[ -d "$repo" ]]; then
        log "--- CC Legal Tools (ausgewaehlte Lizenztexte) ---"
        # Only ingest the main license deeds (DE legalcode where available, else EN)
        for license_dir in "$repo"/legalcode/de/CC0_1.0 "$repo"/legalcode/de/CC-BY_4.0 "$repo"/legalcode/de/CC-BY-SA_4.0; do
            if [[ -d "$license_dir" ]]; then
                while read -r f; do
                    local basename
                    basename=$(basename "$f")
                    upload_file "$f" "$col" "legal_template" "template" "2024" \
                        "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
                        "CC License: $basename" || true
                done < <(find "$license_dir" -name "*.html" -o -name "*.txt" -o -name "*.md" 2>/dev/null | head -3)
            fi
        done
        # Fallback: try top-level legalcode files
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"cc_legal_tools\",\"doc_type\":\"license_text\",\"license\":\"cc0\",\"source\":\"github.com/creativecommons/cc-legal-tools-data\",\"filename\":\"$basename\"}" \
                "CC License: $basename" || true
        done < <(find "$repo"/legalcode -maxdepth 2 -name "*4.0*legalcode*de*" -type f 2>/dev/null | head -5)
    fi

    # --- D4: opr.vc DSGVO-Mustertexte (CC0) ---
    repo="$WORK_DIR/repos/oprvc"
    if [[ -d "$repo" ]]; then
        log "--- opr.vc DSGVO-Mustertexte ---"
        # Look for German privacy/DSGVO content by filename.
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
                "opr.vc: $basename" || true
        done < <(find "$repo" \( -name "*.md" -o -name "*.html" -o -name "*.txt" \) \
            -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
            | grep -iE "(datenschutz|privacy|dsgvo|gdpr|impressum)" \
            | head -20)
        # If no keyword-matching files exist, fall back to any markdown files.
        if [[ $(find "$repo" \( -name "*.md" -o -name "*.html" \) -not -path "*/.git/*" -not -name "README.md" | grep -ciE "(datenschutz|privacy|dsgvo|gdpr)" 2>/dev/null) -eq 0 ]]; then
            while read -r f; do
                local basename
                basename=$(basename "$f")
                upload_file "$f" "$col" "legal_template" "template" "2024" \
                    "{\"source_id\":\"oprvc\",\"doc_type\":\"privacy_policy\",\"license\":\"cc0\",\"source\":\"github.com/oprvc/oprvc.github.io\",\"filename\":\"$basename\"}" \
                    "opr.vc: $basename" || true
            done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" | head -10)
        fi
    fi

    # --- D5: webflorist/privacy-policy-text (MIT) ---
    repo="$WORK_DIR/repos/webflorist"
    if [[ -d "$repo" ]]; then
        log "--- webflorist Privacy Policy Text ---"
        # Look for JSON/text building blocks (German)
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"webflorist\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/webflorist/privacy-policy-text\",\"filename\":\"$basename\"}" \
                "webflorist: $basename" || true
        done < <(find "$repo" \( -name "*.json" -o -name "*.txt" -o -name "*.md" -o -name "*.php" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" -not -name "package*.json" \
            -not -name "composer.json" -not -name "README.md" 2>/dev/null \
            | head -20)
    fi

    # --- D6: Tempest Privacy Policy Generator (MIT) ---
    repo="$WORK_DIR/repos/tempest-privacy"
    if [[ -d "$repo" ]]; then
        log "--- Tempest Privacy Policy Generator ---"
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"tempest_privacy\",\"doc_type\":\"privacy_policy\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/privacy-policy-generator\",\"filename\":\"$basename\"}" \
                "Tempest Privacy: $basename" || true
        done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" \
            -not -name "package*.json" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    # --- D7: Tempest Terms of Service Generator (MIT) ---
    repo="$WORK_DIR/repos/tempest-tos"
    if [[ -d "$repo" ]]; then
        log "--- Tempest Terms of Service Generator ---"
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"tempest_tos\",\"doc_type\":\"tos\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/terms-of-service-generator\",\"filename\":\"$basename\"}" \
                "Tempest ToS: $basename" || true
        done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" \
            -not -name "package*.json" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    # --- D8: Tempest Cookie Banner (MIT) ---
    repo="$WORK_DIR/repos/tempest-cookie"
    if [[ -d "$repo" ]]; then
        log "--- Tempest Cookie Banner ---"
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"tempest_cookie\",\"doc_type\":\"cookie_banner\",\"license\":\"mit\",\"source\":\"github.com/Tempest-Solutions-Company/cookie-banner-consent-solution\",\"filename\":\"$basename\"}" \
                "Tempest Cookie: $basename" || true
        done < <(find "$repo" \( -name "*.md" -o -name "*.txt" -o -name "*.html" -o -name "*.json" \) \
            -not -path "*/.git/*" -not -path "*/node_modules/*" \
            -not -name "package*.json" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    # --- D9: CookieConsent (orestbida) - UI Strings (MIT) ---
    repo="$WORK_DIR/repos/cookieconsent"
    if [[ -d "$repo" ]]; then
        log "--- CookieConsent UI Strings ---"
        # Look for translation/language files
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
                "CookieConsent: $basename" || true
        done < <(find "$repo" -path "*/translations/*" -o -path "*/languages/*" -o -path "*/i18n/*" -o -path "*/locales/*" 2>/dev/null \
            | grep -iE "\.(json|js|ts)$" | head -10)
        # Also check for example configs
        while read -r f; do
            local basename
            basename=$(basename "$f")
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"cookieconsent\",\"doc_type\":\"cookie_consent\",\"license\":\"mit\",\"source\":\"github.com/orestbida/cookieconsent\",\"filename\":\"$basename\"}" \
                "CookieConsent Docs: $basename" || true
        done < <(find "$repo" -name "*.md" -path "*/docs/*" 2>/dev/null | head -5)
    fi

    # --- D10: Common Paper (CC BY 4.0) ---
    log "--- Common Paper Standards ---"
    # Entry format: "<repo dir>:<doc_type>:<label>".
    local -a cp_repos=(
        "common-paper-csa:saas_contract:CSA"
        "common-paper-sla:sla:SLA"
        "common-paper-psa:psa:PSA"
    )
    local entry
    for entry in "${cp_repos[@]}"; do
        local cp_dir="${entry%%:*}"
        local rest="${entry#*:}"
        local cp_doc_type="${rest%%:*}"
        local cp_label="${rest#*:}"
        repo="$WORK_DIR/repos/$cp_dir"
        if [[ -d "$repo" ]]; then
            while read -r f; do
                local basename
                basename=$(basename "$f" .md)
                upload_file "$f" "$col" "legal_template" "template" "2024" \
                    "{\"source_id\":\"common_paper\",\"doc_type\":\"$cp_doc_type\",\"license\":\"cc_by_4\",\"attribution\":\"Common Paper Inc., licensed under CC BY 4.0\",\"source\":\"github.com/CommonPaper/$cp_label\",\"filename\":\"$basename\"}" \
                    "CommonPaper $cp_label: $basename" || true
            done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" \
                -not -name "CONTRIBUTING.md" -not -name "CHANGELOG.md" -not -name "CODE_OF_CONDUCT.md" 2>/dev/null \
                | head -10)
        fi
    done

    # --- D11: Datennutzungsklauseln (CC BY 4.0) ---
    repo="$WORK_DIR/repos/datennutzungsklauseln"
    if [[ -d "$repo" ]]; then
        log "--- Datennutzungsklauseln ---"
        while read -r f; do
            local basename
            basename=$(basename "$f" .md)
            upload_file "$f" "$col" "legal_template" "template" "2024" \
                "{\"source_id\":\"datennutzungsklauseln\",\"doc_type\":\"data_clause\",\"license\":\"cc_by_4\",\"attribution\":\"OpenCode.de, lizenziert unter CC BY 4.0\",\"source\":\"gitlab.opencode.de/wernerth/datennutzungsklauseln-muster\",\"filename\":\"$basename\"}" \
                "Datennutzungsklausel: $basename" || true
        done < <(find "$repo" -name "*.md" -not -path "*/.git/*" -not -name "README.md" 2>/dev/null \
            | head -15)
    fi

    local after
    after=$(collection_count "$col")
    log "Collection $col: $before → $after chunks"
}
|
||||||
|
|
||||||
|
# =============================================================================
# PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz
# =============================================================================
# Uploads the EDPB and EDPS guidance PDFs fetched in phase A. Both regulator
# families share the same upload shape; only the metadata key name, the
# attribution string and the source host differ, so one parametrized loop
# handles both (expanded strings are identical to the former two loops).
phase_datenschutz() {
    log "=========================================="
    log "PHASE E: Datenschutz-Guidance → bp_compliance_datenschutz"
    log "=========================================="

    local collection="bp_compliance_datenschutz"
    local count_before count_after
    count_before=$(collection_count "$collection")
    log "Collection $collection: $count_before chunks (before)"

    local prefix meta_key attribution site label_prefix
    local pdf stem name
    for prefix in edpb edps; do
        if [[ "$prefix" == "edpb" ]]; then
            meta_key="guideline_name"
            attribution="European Data Protection Board (EDPB)"
            site="edpb.europa.eu"
            label_prefix="EDPB"
        else
            meta_key="guidance_name"
            attribution="European Data Protection Supervisor (EDPS)"
            site="edps.europa.eu"
            label_prefix="EDPS"
        fi
        for pdf in "$WORK_DIR"/pdfs/${prefix}_*.pdf; do
            # Glob may not match anything; skip the literal pattern then.
            [[ -f "$pdf" ]] || continue
            stem=$(basename "$pdf" .pdf)
            # Derive a display name: strip the prefix, underscores → spaces.
            name="${stem#"${prefix}"_}"
            name="${name//_/ }"
            upload_file "$pdf" "$collection" "compliance_datenschutz" "guidance" "2024" \
                "{\"source_id\":\"$prefix\",\"doc_type\":\"guidance\",\"$meta_key\":\"$name\",\"license\":\"reuse_notice\",\"attribution\":\"$attribution\",\"source\":\"$site\"}" \
                "$label_prefix: $name"
        done
    done

    count_after=$(collection_count "$collection")
    log "Collection $collection: $count_before → $count_after chunks"
}
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# PHASE F: Verifizierung
|
||||||
|
# =============================================================================
|
||||||
|
# Run one smoke-test search against the RAG API and pretty-print the top hits.
#   $1 = human-readable log line, $2 = JSON request body for /api/v1/search
# Best-effort: network or JSON failures degrade to a short notice, never abort.
_test_search() {
    log "$1"
    curl $CURL_OPTS -X POST "https://localhost:8097/api/v1/search" \
        -H 'Content-Type: application/json' \
        -d "$2" 2>/dev/null \
        | python3 -c "
import sys,json
try:
    data = json.load(sys.stdin)
    results = data.get('results', [])
    print(f' Treffer: {len(results)}')
    for r in results[:3]:
        print(f'    [{r.get(\"score\",0):.3f}] {r.get(\"regulation_code\",\"?\")} - {r.get(\"content\",\"\")[:80]}...')
except: print(' (parse error)')
" 2>/dev/null || echo " (search failed)"
}

# PHASE F: print per-collection chunk counts, then run three representative
# smoke-test searches (one per corpus area) to confirm retrieval works.
phase_verify() {
    log "=========================================="
    log "PHASE F: Verifizierung"
    log "=========================================="

    echo ""
    echo "=== Collection Stats ==="
    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        local count
        count=$(collection_count "$col")
        printf " %-30s %s chunks\n" "$col" "$count"
    done

    echo ""
    echo "=== Test-Suchen ==="

    _test_search "Suche: 'Impressumspflicht digitale Dienste' in bp_compliance_gesetze" \
        '{"query":"Impressumspflicht digitale Dienste","regulation_codes":null,"limit":3,"min_score":0.5}'

    _test_search "Suche: 'Cookie Einwilligung' in bp_compliance_ce" \
        '{"query":"Cookie Einwilligung ePrivacy","regulation_codes":null,"limit":3,"min_score":0.5}'

    _test_search "Suche: 'Privacy Policy Template GDPR' in bp_legal_templates" \
        '{"query":"Privacy Policy Template GDPR","regulation_codes":null,"limit":3,"min_score":0.5}'

    echo ""
}
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# PHASE G: Corpus Version Registration
|
||||||
|
# =============================================================================
|
||||||
|
# PHASE G: register a new corpus version row (compliance_corpus_versions) for
# every non-empty collection. Version format: YYYY-MM-DD.<seq>, where <seq> is
# 1 + the number of versions already registered today for that collection.
phase_register_version() {
    log "=========================================="
    log "PHASE G: Corpus Version Registration"
    log "=========================================="

    local today
    today=$(date '+%Y-%m-%d')

    for col in bp_compliance_gesetze bp_compliance_ce bp_legal_templates bp_compliance_datenschutz; do
        local count
        count=$(collection_count "$col")

        # Never register a version for an empty or unreadable collection.
        if [[ "$count" == "?" || "$count" == "0" ]]; then
            warn "Skipping version for $col (count=$count)"
            continue
        fi

        # Determine next version number for today
        local existing_count
        existing_count=$(psql "$DB_URL" -tAc \
            "SELECT COUNT(*) FROM compliance_corpus_versions WHERE collection_name='$col' AND version LIKE '${today}.%'" \
            2>/dev/null || echo "0")
        # Fix: psql may emit an empty string or stray non-numeric output even
        # when it exits 0; guard so the arithmetic below always yields a valid
        # sequence number instead of a malformed version string.
        [[ "$existing_count" =~ ^[0-9]+$ ]] || existing_count=0
        local seq=$((existing_count + 1))
        local version="${today}.${seq}"

        # Regulations list per collection (Postgres array literal).
        local regs=""
        case "$col" in
            bp_compliance_ce)
                regs='{eu_2022_2065,eu_2002_58,eu_2021_914}'
                ;;
            bp_compliance_gesetze)
                regs='{ddg_5,tdddg_25,urhg_5,egbgb_widerruf,bgb_komplett,urhg_komplett,tmg_komplett}'
                ;;
            bp_legal_templates)
                regs='{github_site_policy,opengov_site_policy,cc_legal_tools,common_paper,webflorist,tempest,cookieconsent}'
                ;;
            bp_compliance_datenschutz)
                regs='{edpb_consent,edpb_privacy_by_design,edpb_dark_patterns,edpb_social_media,edpb_cookie_banner,edps_generative_ai,edps_digital_ethics}'
                ;;
        esac

        # Compute digest from Qdrant collection info: SHA-256 of the sorted
        # JSON of the collection metadata, truncated to 32 hex chars.
        local digest
        digest=$(curl -s "${QDRANT_URL}/collections/${col}" 2>/dev/null \
            | python3 -c "import sys,json,hashlib; d=json.load(sys.stdin); print(hashlib.sha256(json.dumps(d.get('result',{}), sort_keys=True).encode()).hexdigest()[:32])" \
            2>/dev/null || echo "")

        log "Registering version $version for $col ($count chunks)"

        # NOTE(review): documents_count stores the run-global UPLOADED counter,
        # not a per-collection document count — confirm this is intended.
        psql "$DB_URL" -c "
            INSERT INTO compliance_corpus_versions
            (version, collection_name, documents_count, chunks_count, regulations, digest, ingestion_source, created_by)
            VALUES
            ('${version}', '${col}', ${UPLOADED}, ${count}, '${regs}', '${digest}', 'ingest-legal-corpus.sh', 'system')
        " 2>/dev/null && ok "Version $version registered for $col" || warn "Version registration failed for $col (DB not available?)"
    done
}
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# MAIN
|
||||||
|
# =============================================================================
|
||||||
|
# Entry point: verify service availability, run either a single requested
# phase (--only) or the full pipeline, then report upload totals.
# Exits non-zero if any upload failed.
main() {
    log "=========================================="
    log "BreakPilot Legal Corpus Ingestion"
    log "=========================================="
    log "Work dir: $WORK_DIR"
    log "RAG API: $RAG_URL"
    log "Qdrant: $QDRANT_URL"
    echo ""

    # Preflight: a bare POST to the RAG upload endpoint answers with an error
    # mentioning the missing "file" field — use that as a reachability probe.
    curl $CURL_OPTS "$RAG_URL" -X POST 2>/dev/null | grep -q "file" || {
        fail "RAG API not reachable at $RAG_URL"
        exit 1
    }
    ok "RAG API reachable"

    # Preflight: Qdrant must answer the collections listing.
    curl -s "$QDRANT_URL/collections" >/dev/null 2>&1 || {
        fail "Qdrant not reachable at $QDRANT_URL"
        exit 1
    }
    ok "Qdrant reachable"
    echo ""

    if [[ -n "$ONLY_PHASE" ]]; then
        # Single-phase mode: dispatch by name ("version" maps to the
        # differently-named registration function).
        case "$ONLY_PHASE" in
            download|gesetze|eu|templates|datenschutz|verify)
                "phase_${ONLY_PHASE}" ;;
            version)
                phase_register_version ;;
            *)
                fail "Unknown phase: $ONLY_PHASE"; exit 1 ;;
        esac
    else
        # Full pipeline: optional download, then every phase in fixed order,
        # each preceded by a blank separator line.
        if [[ "$SKIP_DOWNLOAD" == "true" ]]; then
            log "Skipping download phase (--skip-download)"
        else
            phase_download
        fi
        local phase
        for phase in phase_gesetze phase_eu phase_templates phase_datenschutz phase_verify phase_register_version; do
            echo ""
            "$phase"
        done
    fi

    # Summary
    echo ""
    log "=========================================="
    log "ERGEBNIS"
    log "=========================================="
    log "Uploaded: $UPLOADED"
    log "Failed: $FAILED"
    log "Skipped: $SKIPPED"
    log "=========================================="

    if (( FAILED > 0 )); then
        warn "$FAILED uploads fehlgeschlagen!"
        exit 1
    fi

    ok "Ingestion abgeschlossen!"
}
|
||||||
|
|
||||||
|
# Script entry point: forward all command-line arguments to main.
main "$@"
|
||||||
Reference in New Issue
Block a user