feat: Add Document Crawler & Auto-Onboarding service (Phase 1.4)
New standalone Python/FastAPI service for automatic compliance document scanning, LLM-based classification, IPFS archival, and gap analysis. Includes extractors (PDF, DOCX, XLSX, PPTX), keyword fallback classifier, compliance matrix, and full REST API on port 8098. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
15
document-crawler/extractors/pdf_extractor.py
Normal file
15
document-crawler/extractors/pdf_extractor.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""PDF text extraction using PyMuPDF (fitz)."""
|
||||
|
||||
import fitz
|
||||
|
||||
|
||||
def extract_pdf(file_path: str) -> str:
    """Extract text from a PDF file.

    Opens the document with PyMuPDF, collects the text of every page, and
    joins the non-blank pages with a blank line between them.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages whose extracted text is not whitespace-only,
        separated by ``"\\n\\n"``. An empty string if no page yields text.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise for unreadable
        input is propagated unchanged (exact exception types depend on the
        installed PyMuPDF version).
    """
    doc = fitz.open(file_path)
    try:
        # Materialize each page's text once; skip pages that are empty or
        # whitespace-only so they do not produce stray separators.
        page_texts = (page.get_text() for page in doc)
        pages = [text for text in page_texts if text.strip()]
    finally:
        # Always release the document handle, even if extraction fails
        # part-way through (e.g. on a damaged page).
        doc.close()
    return "\n\n".join(pages)
|
||||
Reference in New Issue
Block a user