[split-required] Split 500-1000 LOC files across all services
backend-lehrer (5 files): - alerts_agent/db/repository.py (992 → 5), abitur_docs_api.py (956 → 3) - teacher_dashboard_api.py (951 → 3), services/pdf_service.py (916 → 3) - mail/mail_db.py (987 → 6) klausur-service (5 files): - legal_templates_ingestion.py (942 → 3), ocr_pipeline_postprocess.py (929 → 4) - ocr_pipeline_words.py (876 → 3), ocr_pipeline_ocr_merge.py (616 → 2) - KorrekturPage.tsx (956 → 6) website (5 pages): - mail (985 → 9), edu-search (958 → 8), mac-mini (950 → 7) - ocr-labeling (946 → 7), audit-workspace (871 → 4) studio-v2 (5 files + 1 deleted): - page.tsx (946 → 5), MessagesContext.tsx (925 → 4) - korrektur (914 → 6), worksheet-cleanup (899 → 6) - useVocabWorksheet.ts (888 → 3) - Deleted dead page-original.tsx (934 LOC) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
165
klausur-service/backend/legal_templates_cli.py
Normal file
165
klausur-service/backend/legal_templates_cli.py
Normal file
@@ -0,0 +1,165 @@
|
||||
"""
|
||||
Legal Templates CLI — command-line entry point for ingestion and search.
|
||||
|
||||
Extracted from legal_templates_ingestion.py to keep files under 500 LOC.
|
||||
|
||||
Usage:
|
||||
python legal_templates_cli.py --ingest-all
|
||||
python legal_templates_cli.py --ingest-source github-site-policy
|
||||
python legal_templates_cli.py --status
|
||||
python legal_templates_cli.py --search "Datenschutzerklaerung"
|
||||
|
||||
License: Apache 2.0
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
from template_sources import TEMPLATE_SOURCES, LicenseType
|
||||
from legal_templates_ingestion import LegalTemplatesIngestion
|
||||
|
||||
|
||||
def _build_parser():
    """Build the argument parser for the legal-templates CLI.

    Returns:
        An ``argparse.ArgumentParser`` defining every flag that ``main``
        dispatches on.
    """
    # Imported locally, as the original entry point did, so this block does
    # not rely on a module-level ``import argparse``.
    import argparse

    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
    parser.add_argument(
        "--ingest-all",
        action="store_true",
        help="Ingest all enabled sources",
    )
    parser.add_argument(
        "--ingest-source",
        type=str,
        metavar="NAME",
        help="Ingest a specific source by name",
    )
    parser.add_argument(
        "--ingest-license",
        type=str,
        choices=["cc0", "mit", "cc_by_4", "public_domain"],
        help="Ingest all sources of a specific license type",
    )
    parser.add_argument(
        "--max-priority",
        type=int,
        default=3,
        help="Maximum priority level to ingest (1=highest, 5=lowest)",
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show collection status",
    )
    parser.add_argument(
        "--search",
        type=str,
        metavar="QUERY",
        help="Test search query",
    )
    parser.add_argument(
        "--template-type",
        type=str,
        help="Filter search by template type",
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Filter search by language",
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Reset (delete and recreate) the collection",
    )
    parser.add_argument(
        "--delete-source",
        type=str,
        metavar="NAME",
        help="Delete all chunks from a source",
    )
    return parser


def _print_errors(status):
    """Print one ``ERROR:`` line per entry in ``status.errors``, if any."""
    # getattr keeps the reporting harmless for a status object without an
    # ``errors`` attribute; an empty or missing list prints nothing.
    for error in getattr(status, "errors", None) or ():
        print(f" ERROR: {error}")


def _print_ingest_results(results):
    """Print a per-source summary (and errors) for a name -> status mapping."""
    print("\nResults:")
    for name, status in results.items():
        print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
        _print_errors(status)


def _print_search_results(results):
    """Print the numbered hit list produced by the ``--search`` action."""
    print(f"\nFound {len(results)} results:")
    for i, result in enumerate(results, 1):
        print(f"\n{i}. [{result['template_type']}] {result['document_title']}")
        print(f" Score: {result['score']:.3f}")
        print(f" License: {result['license_name']}")
        print(f" Source: {result['source_name']}")
        print(f" Language: {result['language']}")
        if result['attribution_required']:
            print(f" Attribution: {result['attribution_text']}")
        print(f" Text: {result['text'][:200]}...")


async def main():
    """CLI entry point.

    Parses the command line and runs exactly one action. When several action
    flags are given, only the first match in this order is executed:
    ``--reset``, ``--delete-source``, ``--status``, ``--ingest-all``,
    ``--ingest-source``, ``--ingest-license``, ``--search``. With no action
    flag the help text is printed.

    The ingestion client is created after argument parsing and is always
    closed in a ``finally`` clause, including when an action raises.
    """
    parser = _build_parser()
    args = parser.parse_args()

    ingestion = LegalTemplatesIngestion()

    try:
        if args.reset:
            ingestion.reset_collection()
            print("Collection reset successfully")

        elif args.delete_source:
            count = ingestion.delete_source(args.delete_source)
            print(f"Deleted {count} chunks from {args.delete_source}")

        elif args.status:
            status = ingestion.get_status()
            # default=str stringifies any value json cannot encode natively.
            print(json.dumps(status, indent=2, default=str))

        elif args.ingest_all:
            print(f"Ingesting all sources (max priority: {args.max_priority})...")
            results = await ingestion.ingest_all(max_priority=args.max_priority)
            _print_ingest_results(results)
            total = sum(s.chunks_indexed for s in results.values())
            print(f"\nTotal: {total} chunks indexed")

        elif args.ingest_source:
            source = next(
                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
                None,
            )
            if source is None:
                print(f"Unknown source: {args.ingest_source}")
                print("Available sources:")
                for s in TEMPLATE_SOURCES:
                    print(f" - {s.name}")
                # NOTE(review): this returns normally, so the process exits
                # with status 0 even though nothing was ingested.
                return

            print(f"Ingesting: {source.name}")
            status = await ingestion.ingest_source(source)
            print(f"\nResult: {status.chunks_indexed} chunks ({status.status})")
            _print_errors(status)

        elif args.ingest_license:
            license_type = LicenseType(args.ingest_license)
            print(f"Ingesting all {license_type.value} sources...")
            results = await ingestion.ingest_by_license(license_type)
            # Shares the reporter with --ingest-all so per-source errors are
            # shown here too instead of being silently dropped.
            _print_ingest_results(results)

        elif args.search:
            print(f"Searching: {args.search}")
            results = await ingestion.search(
                args.search,
                template_type=args.template_type,
                language=args.language,
            )
            _print_search_results(results)

        else:
            parser.print_help()

    finally:
        await ingestion.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async CLI to completion on a new event loop.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user