Files
breakpilot-lehrer/klausur-service/backend/legal_templates_cli.py
Benjamin Admin b6983ab1dc [split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files):
- alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3)
- teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3)
- mail/mail_db.py (987 → 6)

klausur-service (5 files):
- legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4)
- ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2)
- KorrekturPage.tsx (956 → 6)

website (5 pages):
- mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7)
- ocr-labeling (946 → 7), audit-workspace (871 → 4)

studio-v2 (5 files + 1 deleted):
- page.tsx (946 → 5), MessagesContext.tsx (925 → 4)
- korrektur (914 → 6), worksheet-cleanup (899 → 6)
- useVocabWorksheet.ts (888 → 3)
- Deleted dead page-original.tsx (934 LOC)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-24 23:35:37 +02:00

166 lines
5.2 KiB
Python

"""
Legal Templates CLI — command-line entry point for ingestion and search.

Extracted from legal_templates_ingestion.py to keep files under 500 LOC.

Usage:
    python legal_templates_cli.py --ingest-all
    python legal_templates_cli.py --ingest-source github-site-policy
    python legal_templates_cli.py --status
    python legal_templates_cli.py --search "Datenschutzerklaerung"

License: Apache 2.0
"""
import argparse
import asyncio
import json

from template_sources import TEMPLATE_SOURCES, LicenseType
from legal_templates_ingestion import LegalTemplatesIngestion
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the legal templates CLI.

    Returns:
        A parser with every ingestion, maintenance and search option
        registered.
    """
    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
    parser.add_argument(
        "--ingest-all",
        action="store_true",
        help="Ingest all enabled sources",
    )
    parser.add_argument(
        "--ingest-source",
        type=str,
        metavar="NAME",
        help="Ingest a specific source by name",
    )
    parser.add_argument(
        "--ingest-license",
        type=str,
        choices=["cc0", "mit", "cc_by_4", "public_domain"],
        help="Ingest all sources of a specific license type",
    )
    parser.add_argument(
        "--max-priority",
        type=int,
        default=3,
        help="Maximum priority level to ingest (1=highest, 5=lowest)",
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show collection status",
    )
    parser.add_argument(
        "--search",
        type=str,
        metavar="QUERY",
        help="Test search query",
    )
    parser.add_argument(
        "--template-type",
        type=str,
        help="Filter search by template type",
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Filter search by language",
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Reset (delete and recreate) the collection",
    )
    parser.add_argument(
        "--delete-source",
        type=str,
        metavar="NAME",
        help="Delete all chunks from a source",
    )
    return parser


def _print_source_result(headline: str, status) -> None:
    """Print one source's ingestion summary followed by its errors, if any.

    Args:
        headline: Fully formatted summary line for the source.
        status: Ingestion status object; only its ``errors`` attribute is
            read here.
    """
    print(headline)
    if status.errors:
        for error in status.errors:
            print(f" ERROR: {error}")


def _print_search_results(results) -> None:
    """Print a numbered, human-readable listing of search hits.

    Args:
        results: Sequence of mappings; each hit is read through the keys
            ``template_type``, ``document_title``, ``score``,
            ``license_name``, ``source_name``, ``language``,
            ``attribution_required``, ``attribution_text`` and ``text``.
    """
    print(f"\nFound {len(results)} results:")
    for rank, hit in enumerate(results, 1):
        print(f"\n{rank}. [{hit['template_type']}] {hit['document_title']}")
        print(f" Score: {hit['score']:.3f}")
        print(f" License: {hit['license_name']}")
        print(f" Source: {hit['source_name']}")
        print(f" Language: {hit['language']}")
        if hit['attribution_required']:
            print(f" Attribution: {hit['attribution_text']}")
        # Only a 200-character preview of the chunk text is shown.
        print(f" Text: {hit['text'][:200]}...")


async def main() -> None:
    """CLI entry point.

    Parses ``sys.argv`` and runs exactly one action. When several action
    flags are given, the first match in this order wins: ``--reset``,
    ``--delete-source``, ``--status``, ``--ingest-all``, ``--ingest-source``,
    ``--ingest-license``, ``--search``. With no action the help text is
    printed.

    The ingestion client is created only when an action will actually run,
    and it is always closed afterwards, even if the action raises.

    Raises:
        SystemExit: Raised by ``argparse`` for invalid arguments or ``-h``.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Checked before the ingestion client is created so that printing the
    # help text does not depend on the client being constructible.
    requested_actions = (
        args.reset,
        args.delete_source,
        args.status,
        args.ingest_all,
        args.ingest_source,
        args.ingest_license,
        args.search,
    )
    if not any(requested_actions):
        parser.print_help()
        return

    ingestion = LegalTemplatesIngestion()
    try:
        if args.reset:
            ingestion.reset_collection()
            print("Collection reset successfully")
        elif args.delete_source:
            count = ingestion.delete_source(args.delete_source)
            print(f"Deleted {count} chunks from {args.delete_source}")
        elif args.status:
            status = ingestion.get_status()
            # default=str keeps non-JSON-native values printable.
            print(json.dumps(status, indent=2, default=str))
        elif args.ingest_all:
            print(f"Ingesting all sources (max priority: {args.max_priority})...")
            results = await ingestion.ingest_all(max_priority=args.max_priority)
            print("\nResults:")
            for name, status in results.items():
                _print_source_result(
                    f" {name}: {status.chunks_indexed} chunks ({status.status})",
                    status,
                )
            total = sum(s.chunks_indexed for s in results.values())
            print(f"\nTotal: {total} chunks indexed")
        elif args.ingest_source:
            source = next(
                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
                None,
            )
            if source is None:
                print(f"Unknown source: {args.ingest_source}")
                print("Available sources:")
                for s in TEMPLATE_SOURCES:
                    print(f" - {s.name}")
                return
            print(f"Ingesting: {source.name}")
            status = await ingestion.ingest_source(source)
            _print_source_result(
                f"\nResult: {status.chunks_indexed} chunks ({status.status})",
                status,
            )
        elif args.ingest_license:
            license_type = LicenseType(args.ingest_license)
            print(f"Ingesting all {license_type.value} sources...")
            results = await ingestion.ingest_by_license(license_type)
            print("\nResults:")
            for name, status in results.items():
                # Errors are reported here too, matching the other
                # ingestion modes.
                _print_source_result(
                    f" {name}: {status.chunks_indexed} chunks ({status.status})",
                    status,
                )
        elif args.search:
            print(f"Searching: {args.search}")
            results = await ingestion.search(
                args.search,
                template_type=args.template_type,
                language=args.language,
            )
            _print_search_results(results)
    finally:
        await ingestion.close()
# Script entry point: when executed directly (not imported), run the async
# CLI coroutine to completion via asyncio.run().
if __name__ == "__main__":
    cli_coroutine = main()
    asyncio.run(cli_coroutine)