backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
166 lines
5.2 KiB
Python
166 lines
5.2 KiB
Python
"""
|
|
Legal Templates CLI — command-line entry point for ingestion and search.

Extracted from legal_templates_ingestion.py to keep files under 500 LOC.

Usage:
    python legal_templates_cli.py --ingest-all
    python legal_templates_cli.py --ingest-source github-site-policy
    python legal_templates_cli.py --status
    python legal_templates_cli.py --search "Datenschutzerklaerung"

License: Apache 2.0
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
|
|
from template_sources import TEMPLATE_SOURCES, LicenseType
|
|
from legal_templates_ingestion import LegalTemplatesIngestion
|
|
|
|
|
|
def _build_parser():
    """Build the argument parser for the legal-templates CLI.

    Returns:
        An ``argparse.ArgumentParser`` exposing the ingestion, maintenance
        and search flags.
    """
    # Local import: argparse is only needed when the module runs as a CLI.
    import argparse

    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
    parser.add_argument(
        "--ingest-all",
        action="store_true",
        help="Ingest all enabled sources",
    )
    parser.add_argument(
        "--ingest-source",
        type=str,
        metavar="NAME",
        help="Ingest a specific source by name",
    )
    parser.add_argument(
        "--ingest-license",
        type=str,
        choices=["cc0", "mit", "cc_by_4", "public_domain"],
        help="Ingest all sources of a specific license type",
    )
    parser.add_argument(
        "--max-priority",
        type=int,
        default=3,
        help="Maximum priority level to ingest (1=highest, 5=lowest)",
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show collection status",
    )
    parser.add_argument(
        "--search",
        type=str,
        metavar="QUERY",
        help="Test search query",
    )
    parser.add_argument(
        "--template-type",
        type=str,
        help="Filter search by template type",
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Filter search by language",
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Reset (delete and recreate) the collection",
    )
    parser.add_argument(
        "--delete-source",
        type=str,
        metavar="NAME",
        help="Delete all chunks from a source",
    )
    return parser


def _print_errors(status):
    """Print one ``ERROR:`` line for every entry in ``status.errors``."""
    for error in status.errors or ():
        print(f" ERROR: {error}")


def _print_results(results):
    """Print a per-source summary (chunk count, status, errors).

    Args:
        results: Mapping of source name to an ingestion status object that
            exposes ``chunks_indexed``, ``status`` and ``errors``.
    """
    print("\nResults:")
    for name, status in results.items():
        print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
        _print_errors(status)


def _print_search_results(results):
    """Print the hits returned by a search query in a readable form.

    Args:
        results: Sequence of result mappings; the keys read here are
            ``template_type``, ``document_title``, ``score``,
            ``license_name``, ``source_name``, ``language``,
            ``attribution_required``, ``attribution_text`` and ``text``.
    """
    print(f"\nFound {len(results)} results:")
    for i, result in enumerate(results, 1):
        print(f"\n{i}. [{result['template_type']}] {result['document_title']}")
        print(f" Score: {result['score']:.3f}")
        print(f" License: {result['license_name']}")
        print(f" Source: {result['source_name']}")
        print(f" Language: {result['language']}")
        if result['attribution_required']:
            print(f" Attribution: {result['attribution_text']}")
        text = result['text']
        # Only mark the preview as truncated when something was cut off.
        ellipsis = "..." if len(text) > 200 else ""
        print(f" Text: {text[:200]}{ellipsis}")


async def main():
    """CLI entry point.

    Parses the command line and runs exactly one action. When several
    action flags are given, the first match in this order wins:
    ``--reset``, ``--delete-source``, ``--status``, ``--ingest-all``,
    ``--ingest-source``, ``--ingest-license``, ``--search``.
    ``--max-priority`` only affects ``--ingest-all``; ``--template-type``
    and ``--language`` only affect ``--search``.

    With no action flag the help text is printed and nothing else happens.
    The ingestion client is always closed before returning.
    """
    parser = _build_parser()
    args = parser.parse_args()

    requested_actions = (
        args.reset,
        args.delete_source,
        args.status,
        args.ingest_all,
        args.ingest_source,
        args.ingest_license,
        args.search,
    )
    if not any(requested_actions):
        # Nothing to do: show usage without constructing the ingestion client.
        parser.print_help()
        return

    ingestion = LegalTemplatesIngestion()

    try:
        if args.reset:
            ingestion.reset_collection()
            print("Collection reset successfully")

        elif args.delete_source:
            count = ingestion.delete_source(args.delete_source)
            print(f"Deleted {count} chunks from {args.delete_source}")

        elif args.status:
            status = ingestion.get_status()
            # default=str keeps non-JSON-native values printable.
            print(json.dumps(status, indent=2, default=str))

        elif args.ingest_all:
            print(f"Ingesting all sources (max priority: {args.max_priority})...")
            results = await ingestion.ingest_all(max_priority=args.max_priority)
            _print_results(results)
            total = sum(s.chunks_indexed for s in results.values())
            print(f"\nTotal: {total} chunks indexed")

        elif args.ingest_source:
            source = next(
                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
                None,
            )
            if source is None:
                print(f"Unknown source: {args.ingest_source}")
                print("Available sources:")
                for s in TEMPLATE_SOURCES:
                    print(f" - {s.name}")
                return

            print(f"Ingesting: {source.name}")
            status = await ingestion.ingest_source(source)
            print(f"\nResult: {status.chunks_indexed} chunks ({status.status})")
            _print_errors(status)

        elif args.ingest_license:
            license_type = LicenseType(args.ingest_license)
            print(f"Ingesting all {license_type.value} sources...")
            results = await ingestion.ingest_by_license(license_type)
            # Same report as --ingest-all, so per-source errors are shown too.
            _print_results(results)

        elif args.search:
            print(f"Searching: {args.search}")
            results = await ingestion.search(
                args.search,
                template_type=args.template_type,
                language=args.language,
            )
            _print_search_results(results)

    finally:
        await ingestion.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: run the async CLI to completion on a new event loop.
    cli_coroutine = main()
    asyncio.run(cli_coroutine)
|