feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), a keyword fallback classifier, a compliance matrix, and a full REST API on port 8098.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
18
document-crawler/extractors/docx_extractor.py
Normal file
18
document-crawler/extractors/docx_extractor.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""DOCX text extraction using python-docx."""
|
||||
|
||||
from docx import Document
|
||||
|
||||
|
||||
def extract_docx(file_path: str) -> str:
    """Extract the text of a DOCX file as a single string.

    Non-blank body paragraphs are collected first, followed by one line per
    table row in which the row's non-empty cell texts are joined with
    ``" | "``. All collected pieces are then joined with blank lines.

    Note that table text is appended after all paragraph text, so the
    original interleaving of paragraphs and tables is not preserved.

    Args:
        file_path: Path of the DOCX file, passed directly to
            ``docx.Document``.

    Returns:
        The extracted text, or an empty string when the document has no
        non-blank paragraphs or table cells.

    Raises:
        Whatever ``docx.Document`` raises for a missing or invalid file;
        no exception is caught here.
    """
    doc = Document(file_path)
    blocks = [p.text for p in doc.paragraphs if p.text.strip()]

    # Also extract from tables
    for table in doc.tables:
        for row in table.rows:
            row_texts = []
            previous_tc = None
            for cell in row.cells:
                # python-docx yields one entry per grid column, so a
                # horizontally merged cell shows up once per spanned column.
                # All of those entries wrap the same underlying <w:tc>
                # element; skip the repeats so the merged text is emitted
                # only once per row.
                current_tc = cell._tc
                if current_tc is previous_tc:
                    continue
                previous_tc = current_tc

                text = cell.text.strip()
                if text:
                    row_texts.append(text)
            if row_texts:
                blocks.append(" | ".join(row_texts))

    return "\n\n".join(blocks)
|
||||
Reference in New Issue
Block a user