feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
document-crawler/crawlers/__init__.py
Normal file
1
document-crawler/crawlers/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .filesystem_crawler import FilesystemCrawler
|
||||
100
document-crawler/crawlers/filesystem_crawler.py
Normal file
100
document-crawler/crawlers/filesystem_crawler.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Local/NAS directory scanning with SHA-256 delta detection."""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from config import settings
|
||||
|
||||
|
||||
@dataclass
class CrawledFile:
    """Metadata record for one document discovered by FilesystemCrawler."""

    file_path: str        # absolute/full path as produced by os.path.join(root, name)
    file_name: str        # basename including extension
    file_extension: str   # lowercased extension with leading dot, e.g. ".pdf"
    file_size_bytes: int  # size reported by os.stat
    file_hash: str        # SHA-256 hex digest of the file contents
    relative_path: str    # path relative to the crawler's base_path
|
||||
|
||||
|
||||
class FilesystemCrawler:
|
||||
"""Walks a directory tree and discovers document files."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_path: str,
|
||||
file_extensions: list[str] | None = None,
|
||||
max_depth: int = 5,
|
||||
exclude_patterns: list[str] | None = None,
|
||||
):
|
||||
self.base_path = base_path
|
||||
self.file_extensions = file_extensions or settings.SUPPORTED_EXTENSIONS
|
||||
self.max_depth = max_depth
|
||||
self.exclude_patterns = exclude_patterns or []
|
||||
|
||||
def _should_exclude(self, path: str) -> bool:
|
||||
for pattern in self.exclude_patterns:
|
||||
if pattern in path:
|
||||
return True
|
||||
return False
|
||||
|
||||
def _hash_file(self, path: str) -> str:
|
||||
sha256 = hashlib.sha256()
|
||||
with open(path, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(8192), b""):
|
||||
sha256.update(chunk)
|
||||
return sha256.hexdigest()
|
||||
|
||||
def crawl(self) -> list[CrawledFile]:
|
||||
"""Walk the directory tree and return discovered files."""
|
||||
results: list[CrawledFile] = []
|
||||
|
||||
if not os.path.isdir(self.base_path):
|
||||
return results
|
||||
|
||||
for root, dirs, files in os.walk(self.base_path):
|
||||
# Compute depth relative to base
|
||||
rel = os.path.relpath(root, self.base_path)
|
||||
depth = 0 if rel == "." else rel.count(os.sep) + 1
|
||||
if depth >= self.max_depth:
|
||||
dirs.clear()
|
||||
continue
|
||||
|
||||
if self._should_exclude(root):
|
||||
dirs.clear()
|
||||
continue
|
||||
|
||||
for fname in files:
|
||||
full_path = os.path.join(root, fname)
|
||||
_, ext = os.path.splitext(fname)
|
||||
ext = ext.lower()
|
||||
|
||||
if ext not in self.file_extensions:
|
||||
continue
|
||||
|
||||
if self._should_exclude(full_path):
|
||||
continue
|
||||
|
||||
try:
|
||||
stat = os.stat(full_path)
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
if stat.st_size > settings.MAX_FILE_SIZE_BYTES:
|
||||
continue
|
||||
|
||||
file_hash = self._hash_file(full_path)
|
||||
relative = os.path.relpath(full_path, self.base_path)
|
||||
|
||||
results.append(CrawledFile(
|
||||
file_path=full_path,
|
||||
file_name=fname,
|
||||
file_extension=ext,
|
||||
file_size_bytes=stat.st_size,
|
||||
file_hash=file_hash,
|
||||
relative_path=relative,
|
||||
))
|
||||
|
||||
return results
|
||||
8
document-crawler/crawlers/smb_crawler.py
Normal file
8
document-crawler/crawlers/smb_crawler.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""SMB/CIFS share scanning — placeholder for Phase 2."""
|
||||
|
||||
|
||||
class SMBCrawler:
    """Stub for SMB/CIFS share scanning; the real implementation lands in a later phase."""

    def __init__(self, *args, **kwargs):
        # Fail loudly on construction so a misconfigured deployment that
        # selects SMB crawling surfaces the problem immediately.
        raise NotImplementedError("SMB crawling is planned for Phase 2")
|
||||
Reference in New Issue
Block a user