"""
Legal Templates CLI — command-line entry point for ingestion and search.

Extracted from legal_templates_ingestion.py to keep files under 500 LOC.

Usage:
    python legal_templates_cli.py --ingest-all
    python legal_templates_cli.py --ingest-source github-site-policy
    python legal_templates_cli.py --status
    python legal_templates_cli.py --search "Datenschutzerklaerung"

License: Apache 2.0
"""

import argparse
import asyncio
import json

from template_sources import TEMPLATE_SOURCES, LicenseType
from legal_templates_ingestion import LegalTemplatesIngestion


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser describing every CLI option."""
    parser = argparse.ArgumentParser(description="Legal Templates Ingestion")
    parser.add_argument(
        "--ingest-all",
        action="store_true",
        help="Ingest all enabled sources",
    )
    parser.add_argument(
        "--ingest-source",
        type=str,
        metavar="NAME",
        help="Ingest a specific source by name",
    )
    parser.add_argument(
        "--ingest-license",
        type=str,
        choices=["cc0", "mit", "cc_by_4", "public_domain"],
        help="Ingest all sources of a specific license type",
    )
    parser.add_argument(
        "--max-priority",
        type=int,
        default=3,
        help="Maximum priority level to ingest (1=highest, 5=lowest)",
    )
    parser.add_argument(
        "--status",
        action="store_true",
        help="Show collection status",
    )
    parser.add_argument(
        "--search",
        type=str,
        metavar="QUERY",
        help="Test search query",
    )
    parser.add_argument(
        "--template-type",
        type=str,
        help="Filter search by template type",
    )
    parser.add_argument(
        "--language",
        type=str,
        help="Filter search by language",
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Reset (delete and recreate) the collection",
    )
    parser.add_argument(
        "--delete-source",
        type=str,
        metavar="NAME",
        help="Delete all chunks from a source",
    )
    return parser


def _print_errors(status) -> None:
    """Print one ERROR line for every entry in ``status.errors``, if any."""
    if status.errors:
        for error in status.errors:
            print(f" ERROR: {error}")


def _print_ingest_results(results) -> None:
    """Print a per-source summary (and errors) for a name -> status mapping."""
    print("\nResults:")
    for name, status in results.items():
        print(f" {name}: {status.chunks_indexed} chunks ({status.status})")
        _print_errors(status)


def _print_search_results(results) -> None:
    """Print the hit count followed by a numbered summary of each hit."""
    print(f"\nFound {len(results)} results:")
    for i, result in enumerate(results, 1):
        print(f"\n{i}. [{result['template_type']}] {result['document_title']}")
        print(f" Score: {result['score']:.3f}")
        print(f" License: {result['license_name']}")
        print(f" Source: {result['source_name']}")
        print(f" Language: {result['language']}")
        if result['attribution_required']:
            print(f" Attribution: {result['attribution_text']}")
        # Only the first 200 characters of the text are shown.
        print(f" Text: {result['text'][:200]}...")


async def main() -> int:
    """CLI entry point.

    Parses the command line and runs exactly one action. When several action
    flags are given, only the first match in this order is executed:
    ``--reset``, ``--delete-source``, ``--status``, ``--ingest-all``,
    ``--ingest-source``, ``--ingest-license``, ``--search``. With no action
    flag the help text is printed.

    Returns:
        Process exit status: 1 if ``--ingest-source`` names a source that is
        not in ``TEMPLATE_SOURCES``, otherwise 0.
    """
    parser = _build_parser()
    args = parser.parse_args()

    ingestion = LegalTemplatesIngestion()
    try:
        if args.reset:
            ingestion.reset_collection()
            print("Collection reset successfully")

        elif args.delete_source:
            count = ingestion.delete_source(args.delete_source)
            print(f"Deleted {count} chunks from {args.delete_source}")

        elif args.status:
            status = ingestion.get_status()
            # default=str stringifies any value json cannot encode natively.
            print(json.dumps(status, indent=2, default=str))

        elif args.ingest_all:
            print(f"Ingesting all sources (max priority: {args.max_priority})...")
            results = await ingestion.ingest_all(max_priority=args.max_priority)
            _print_ingest_results(results)
            total = sum(s.chunks_indexed for s in results.values())
            print(f"\nTotal: {total} chunks indexed")

        elif args.ingest_source:
            source = next(
                (s for s in TEMPLATE_SOURCES if s.name == args.ingest_source),
                None,
            )
            if source is None:
                print(f"Unknown source: {args.ingest_source}")
                print("Available sources:")
                for s in TEMPLATE_SOURCES:
                    print(f" - {s.name}")
                # Report failure so calling scripts can detect the bad name;
                # the finally clause below still closes the ingestion object.
                return 1

            print(f"Ingesting: {source.name}")
            status = await ingestion.ingest_source(source)
            print(f"\nResult: {status.chunks_indexed} chunks ({status.status})")
            _print_errors(status)

        elif args.ingest_license:
            license_type = LicenseType(args.ingest_license)
            print(f"Ingesting all {license_type.value} sources...")
            results = await ingestion.ingest_by_license(license_type)
            # Errors are reported here too, matching --ingest-all.
            _print_ingest_results(results)

        elif args.search:
            print(f"Searching: {args.search}")
            results = await ingestion.search(
                args.search,
                template_type=args.template_type,
                language=args.language,
            )
            _print_search_results(results)

        else:
            parser.print_help()
    finally:
        # Always release the ingestion object's resources, even on error.
        await ingestion.close()

    return 0


if __name__ == "__main__":
    raise SystemExit(asyncio.run(main()))