feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
22
document-crawler/extractors/xlsx_extractor.py
Normal file
22
document-crawler/extractors/xlsx_extractor.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""XLSX text extraction using openpyxl."""
|
||||
|
||||
from openpyxl import load_workbook
|
||||
|
||||
|
||||
def extract_xlsx(file_path: str) -> str:
    """Extract the text content of an XLSX workbook as a single string.

    Every sheet listed in the workbook is walked row by row. Cells whose
    value is ``None`` are skipped, the remaining values are converted with
    ``str()`` and joined with ``" | "``. Rows that end up empty are dropped,
    and sheets without any remaining rows are omitted from the output.

    Args:
        file_path: Path of the XLSX file to read.

    Returns:
        One section per non-empty sheet, each starting with a
        ``[sheet name]`` header line followed by one line per row; sections
        are separated by a blank line. An empty string is returned when no
        sheet contains any value.
    """
    # read_only=True streams the sheets instead of loading them fully;
    # data_only=True asks openpyxl for stored cell values rather than
    # formula text.
    wb = load_workbook(file_path, read_only=True, data_only=True)
    sheets = []

    try:
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = []
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) for c in row if c is not None]
                if cells:
                    rows.append(" | ".join(cells))
            if rows:
                sheets.append(f"[{sheet_name}]\n" + "\n".join(rows))
    finally:
        # A read-only workbook keeps the underlying file open until it is
        # closed explicitly, so release it even when a sheet fails to parse.
        wb.close()

    return "\n\n".join(sheets)
|
||||
Reference in New Issue
Block a user