feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
document-crawler/crawlers/__init__.py
Normal file
1
document-crawler/crawlers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .filesystem_crawler import FilesystemCrawler
|
||||
100
document-crawler/crawlers/filesystem_crawler.py
Normal file
100
document-crawler/crawlers/filesystem_crawler.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Local/NAS directory scanning with SHA-256 delta detection."""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from config import settings
|
||||
|
||||
|
||||
@dataclass
class CrawledFile:
    """Metadata record for one document discovered by FilesystemCrawler."""

    file_path: str        # absolute/full path as produced by os.path.join(root, name)
    file_name: str        # basename including extension
    file_extension: str   # lowercased extension with leading dot, e.g. ".pdf"
    file_size_bytes: int  # size reported by os.stat
    file_hash: str        # SHA-256 hex digest of the file contents
    relative_path: str    # path relative to the crawler's base_path
|
||||
|
||||
|
||||
class FilesystemCrawler:
|
||||
"""Walks a directory tree and discovers document files."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_path: str,
|
||||
file_extensions: list[str] | None = None,
|
||||
max_depth: int = 5,
|
||||
exclude_patterns: list[str] | None = None,
|
||||
):
|
||||
self.base_path = base_path
|
||||
self.file_extensions = file_extensions or settings.SUPPORTED_EXTENSIONS
|
||||
self.max_depth = max_depth
|
||||
self.exclude_patterns = exclude_patterns or []
|
||||
|
||||
def _should_exclude(self, path: str) -> bool:
|
||||
for pattern in self.exclude_patterns:
|
||||
if pattern in path:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _hash_file(self, path: str) -> str:
|
||||
sha256 = hashlib.sha256()
|
||||
with open(path, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(8192), b""):
|
||||
sha256.update(chunk)
|
||||
return sha256.hexdigest()
|
||||
|
||||
def crawl(self) -> list[CrawledFile]:
|
||||
"""Walk the directory tree and return discovered files."""
|
||||
results: list[CrawledFile] = []
|
||||
|
||||
if not os.path.isdir(self.base_path):
|
||||
return results
|
||||
|
||||
for root, dirs, files in os.walk(self.base_path):
|
||||
# Compute depth relative to base
|
||||
rel = os.path.relpath(root, self.base_path)
|
||||
depth = 0 if rel == "." else rel.count(os.sep) + 1
|
||||
if depth >= self.max_depth:
|
||||
dirs.clear()
|
||||
continue
|
||||
|
||||
if self._should_exclude(root):
|
||||
dirs.clear()
|
||||
continue
|
||||
|
||||
for fname in files:
|
||||
full_path = os.path.join(root, fname)
|
||||
_, ext = os.path.splitext(fname)
|
||||
ext = ext.lower()
|
||||
|
||||
if ext not in self.file_extensions:
|
||||
continue
|
||||
|
||||
if self._should_exclude(full_path):
|
||||
continue
|
||||
|
||||
try:
|
||||
stat = os.stat(full_path)
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
if stat.st_size > settings.MAX_FILE_SIZE_BYTES:
|
||||
continue
|
||||
|
||||
file_hash = self._hash_file(full_path)
|
||||
relative = os.path.relpath(full_path, self.base_path)
|
||||
|
||||
results.append(CrawledFile(
|
||||
file_path=full_path,
|
||||
file_name=fname,
|
||||
file_extension=ext,
|
||||
file_size_bytes=stat.st_size,
|
||||
file_hash=file_hash,
|
||||
relative_path=relative,
|
||||
))
|
||||
|
||||
return results
|
||||
8
document-crawler/crawlers/smb_crawler.py
Normal file
8
document-crawler/crawlers/smb_crawler.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""SMB/CIFS share scanning — placeholder for Phase 2."""
|
||||
|
||||
|
||||
class SMBCrawler:
    """Stub for SMB/CIFS share scanning; the real implementation lands in a later phase."""

    def __init__(self, *args, **kwargs):
        # Fail loudly on construction so a misconfigured deployment that
        # selects SMB crawling surfaces the problem immediately.
        raise NotImplementedError("SMB crawling is planned for Phase 2")
|
||||
Reference in New Issue
Block a user