[split-required] Split 500-1000 LOC files across all services

backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:35:37 +02:00
parent 6811264756
commit b6983ab1dc
99 changed files with 13484 additions and 16106 deletions
--- a/klausur-service/backend/legal_templates_cli.py
+++ b/klausur-service/backend/legal_templates_cli.py
@@ -0,0 +1,165 @@
+"""
+Legal Templates CLI — command-line entry point for ingestion and search.
+
+Extracted from legal_templates_ingestion.py to keep files under 500 LOC.
+
+Usage:
+    python legal_templates_cli.py --ingest-all
+    python legal_templates_cli.py --ingest-source github-site-policy
+    python legal_templates_cli.py --status
+    python legal_templates_cli.py --search "Datenschutzerklaerung"
+
+Lizenz: Apache 2.0
+"""
+
+import asyncio
+import json
+
+from template_sources import TEMPLATE_SOURCES, LicenseType
+from legal_templates_ingestion import LegalTemplatesIngestion
+
+
+async def main():
+    """CLI entry point."""
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
+    parser.add_argument(
+        "--ingest-all",
+        action="store_true",
+        help="Ingest all enabled sources"
+    )
+    parser.add_argument(
+        "--ingest-source",
+        type=str,
+        metavar="NAME",
+        help="Ingest a specific source by name"
+    )
+    parser.add_argument(
+        "--ingest-license",
+        type=str,
+        choices=["cc0", "mit", "cc_by_4", "public_domain"],
+        help="Ingest all sources of a specific license type"
+    )
+    parser.add_argument(
+        "--max-priority",
+        type=int,
+        default=3,
+        help="Maximum priority level to ingest (1=highest, 5=lowest)"
+    )
+    parser.add_argument(
+        "--status",
+        action="store_true",
+        help="Show collection status"
+    )
+    parser.add_argument(
+        "--search",
+        type=str,
+        metavar="QUERY",
+        help="Test search query"
+    )
+    parser.add_argument(
+        "--template-type",
+        type=str,
+        help="Filter search by template type"
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        help="Filter search by language"
+    )
+    parser.add_argument(
+        "--reset",
+        action="store_true",
+        help="Reset (delete and recreate) the collection"
+    )
+    parser.add_argument(
+        "--delete-source",
+        type=str,
+        metavar="NAME",
+        help="Delete all chunks from a source"
+    )
+
+    args = parser.parse_args()
+
+    ingestion = LegalTemplatesIngestion()
+
+    try:
+        if args.reset:
+            ingestion.reset_collection()
+            print("Collection reset successfully")
+
+        elif args.delete_source:
+            count = ingestion.delete_source(args.delete_source)
+            print(f"Deleted {count} chunks from {args.delete_source}")
+
+        elif args.status:
+            status = ingestion.get_status()
+            print(json.dumps(status, indent=2, default=str))
+
+        elif args.ingest_all:
+            print(f"Ingesting all sources (max priority: {args.max_priority})...")
+            results = await ingestion.ingest_all(max_priority=args.max_priority)
+            print("\nResults:")
+            for name, status in results.items():
+                print(f"  {name}: {status.chunks_indexed} chunks ({status.status})")
+                if status.errors:
+                    for error in status.errors:
+                        print(f"    ERROR: {error}")
+            total = sum(s.chunks_indexed for s in results.values())
+            print(f"\nTotal: {total} chunks indexed")
+
+        elif args.ingest_source:
+            source = next(
+                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
+                None
+            )
+            if not source:
+                print(f"Unknown source: {args.ingest_source}")
+                print("Available sources:")
+                for s in TEMPLATE_SOURCES:
+                    print(f"  - {s.name}")
+                return
+
+            print(f"Ingesting: {source.name}")
+            status = await ingestion.ingest_source(source)
+            print(f"\nResult: {status.chunks_indexed} chunks ({status.status})")
+            if status.errors:
+                for error in status.errors:
+                    print(f"  ERROR: {error}")
+
+        elif args.ingest_license:
+            license_type = LicenseType(args.ingest_license)
+            print(f"Ingesting all {license_type.value} sources...")
+            results = await ingestion.ingest_by_license(license_type)
+            print("\nResults:")
+            for name, status in results.items():
+                print(f"  {name}: {status.chunks_indexed} chunks ({status.status})")
+
+        elif args.search:
+            print(f"Searching: {args.search}")
+            results = await ingestion.search(
+                args.search,
+                template_type=args.template_type,
+                language=args.language,
+            )
+            print(f"\nFound {len(results)} results:")
+            for i, result in enumerate(results, 1):
+                print(f"\n{i}. [{result['template_type']}] {result['document_title']}")
+                print(f"   Score: {result['score']:.3f}")
+                print(f"   License: {result['license_name']}")
+                print(f"   Source: {result['source_name']}")
+                print(f"   Language: {result['language']}")
+                if result['attribution_required']:
+                    print(f"   Attribution: {result['attribution_text']}")
+                print(f"   Text: {result['text'][:200]}...")
+
+        else:
+            parser.print_help()
+
+    finally:
+        await ingestion.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())  # main() is a coroutine function; asyncio.run drives it to completion