feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
document-crawler/extractors/__init__.py
Normal file
1
document-crawler/extractors/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .dispatcher import extract_text
|
||||
25
document-crawler/extractors/dispatcher.py
Normal file
25
document-crawler/extractors/dispatcher.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Routes files to the appropriate extractor by extension."""
|
||||
|
||||
from .pdf_extractor import extract_pdf
|
||||
from .docx_extractor import extract_docx
|
||||
from .xlsx_extractor import extract_xlsx
|
||||
from .pptx_extractor import extract_pptx
|
||||
|
||||
# Dispatch table: lowercased file extension (with leading dot) -> extractor
# callable taking a file path and returning the extracted plain text.
# extract_text() below looks extensions up here; unknown ones raise ValueError.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
    ".xlsx": extract_xlsx,
    ".pptx": extract_pptx,
}
|
||||
|
||||
|
||||
def extract_text(file_path: str, extension: str | None = None) -> str:
    """Extract text from a file based on its extension.

    Args:
        file_path: Path of the file to extract from.
        extension: File extension including the leading dot (e.g. ``".pdf"``),
            matched case-insensitively. If ``None``, the extension is derived
            from ``file_path`` itself, so callers may pass just a path.

    Returns:
        The extracted plain text.

    Raises:
        ValueError: If no extractor is registered for the extension.
    """
    if extension is None:
        # Local import keeps the module's dependency surface unchanged.
        import os

        extension = os.path.splitext(file_path)[1]
    ext = extension.lower()
    extractor = EXTRACTORS.get(ext)
    if extractor is None:
        raise ValueError(f"Unsupported file extension: {ext}")
    return extractor(file_path)
|
||||
18
document-crawler/extractors/docx_extractor.py
Normal file
18
document-crawler/extractors/docx_extractor.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""DOCX text extraction using python-docx."""
|
||||
|
||||
from docx import Document
|
||||
|
||||
|
||||
def extract_docx(file_path: str) -> str:
    """Extract text from a DOCX file."""
    document = Document(file_path)

    # Non-empty body paragraphs come first, in document order.
    chunks = [para.text for para in document.paragraphs if para.text.strip()]

    # Table content follows: one line per non-empty row, cells pipe-joined.
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            row_text = [c.text.strip() for c in tbl_row.cells if c.text.strip()]
            if row_text:
                chunks.append(" | ".join(row_text))

    return "\n\n".join(chunks)
|
||||
15
document-crawler/extractors/pdf_extractor.py
Normal file
15
document-crawler/extractors/pdf_extractor.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""PDF text extraction using PyMuPDF (fitz)."""
|
||||
|
||||
import fitz
|
||||
|
||||
|
||||
def extract_pdf(file_path: str) -> str:
    """Extract text from a PDF file.

    Args:
        file_path: Path to the PDF file.

    Returns:
        Text of all pages with non-blank content, pages separated by a
        blank line.
    """
    # Context manager guarantees the document handle is released even if
    # get_text() raises mid-iteration (the previous version leaked it then).
    with fitz.open(file_path) as doc:
        pages = []
        for page in doc:
            text = page.get_text()
            if text.strip():
                pages.append(text)
    return "\n\n".join(pages)
|
||||
22
document-crawler/extractors/pptx_extractor.py
Normal file
22
document-crawler/extractors/pptx_extractor.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""PPTX text extraction using python-pptx."""
|
||||
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
def extract_pptx(file_path: str) -> str:
    """Extract text from a PPTX file."""
    deck = Presentation(file_path)
    rendered = []

    for number, slide in enumerate(deck.slides, 1):
        lines = []
        for shape in slide.shapes:
            # Skip shapes without a text frame (pictures, charts, ...).
            if not shape.has_text_frame:
                continue
            lines.extend(
                para.text.strip()
                for para in shape.text_frame.paragraphs
                if para.text.strip()
            )
        if lines:
            # "[Folie N]" ("slide" in German) labels each slide's text block.
            rendered.append(f"[Folie {number}]\n" + "\n".join(lines))

    return "\n\n".join(rendered)
|
||||
22
document-crawler/extractors/xlsx_extractor.py
Normal file
22
document-crawler/extractors/xlsx_extractor.py
Normal file
@@ -0,0 +1,22 @@
|
||||
"""XLSX text extraction using openpyxl."""
|
||||
|
||||
from openpyxl import load_workbook
|
||||
|
||||
|
||||
def extract_xlsx(file_path: str) -> str:
    """Extract text from an XLSX file.

    Each non-empty sheet is rendered as a ``[sheet name]`` header followed
    by one line per non-empty row, cell values joined with ``" | "``.

    Args:
        file_path: Path to the XLSX workbook.

    Returns:
        Sheet blocks separated by blank lines.
    """
    # read_only streams rows (bounded memory); data_only yields cached
    # formula results rather than formula strings.
    wb = load_workbook(file_path, read_only=True, data_only=True)
    try:
        sheets = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            rows = []
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) for c in row if c is not None]
                if cells:
                    rows.append(" | ".join(cells))
            if rows:
                sheets.append(f"[{sheet_name}]\n" + "\n".join(rows))
    finally:
        # read-only mode keeps the file handle open until close(); the
        # previous version leaked it when iteration raised.
        wb.close()
    return "\n\n".join(sheets)
|
||||
Reference in New Issue
Block a user