Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 36s
CI/CD / test-python-backend-compliance (push) Successful in 32s
CI/CD / test-python-document-crawler (push) Successful in 22s
CI/CD / test-python-dsms-gateway (push) Successful in 19s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
QA pipeline that matches control source_original_text directly against original PDF documents to verify article/paragraph assignments. Covers backfill, dedup, source normalization, Qdrant cleanup, and prod sync. Key results (2026-03-20): - 4,110/7,943 controls matched to PDF (100% for major EU regs) - 3,366 article corrections, 705 new assignments - 1,290 controls from Erwägungsgründe (preamble) identified - 779 controls from Anhänge (annexes) identified Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
132 lines
4.5 KiB
Python
132 lines
4.5 KiB
Python
"""POC v2: Find control's source text in PDF — distinguish headings from cross-refs."""
|
|
import bisect
import os
import re
import unicodedata
import urllib.parse

import fitz  # PyMuPDF
import psycopg2
|
|
|
|
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/battery_2023_1542.pdf")
|
|
|
|
# Step 1: Extract full text from PDF
print("=== Step 1: Reading PDF ===")
doc = fitz.open(PDF_PATH)
# Join once instead of `full_text += ...` per page — repeated string
# concatenation in a loop is quadratic in total text size; join is linear.
full_text = "".join(page.get_text() + "\n" for page in doc)
print(f" Pages: {len(doc)}, Total chars: {len(full_text)}")
|
|
|
|
def normalize(s: str) -> str:
    """Normalize text for fuzzy substring matching against the PDF.

    Removes soft hyphens (U+00AD) and zero-width spaces (U+200B), applies
    Unicode NFC composition, collapses every whitespace run to a single
    space, and strips leading/trailing whitespace.
    """
    # '\xad' and '\u00ad' are the same codepoint — the original did the
    # replace twice. One translate() pass deletes both invisible characters.
    s = s.translate({0x00AD: None, 0x200B: None})
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
|
|
|
# Step 2: Build article heading index
# Article headings in EU regulations are on their own line: "Artikel 76"
# followed by a title line like: "Rücknahme"
# Cross-references look like: "gemäß Artikel 290 des Vertrags"
print("\n=== Step 2: Building article HEADING index ===")
# Pattern: "Artikel N" at start of line, NOT preceded by text on same line
heading_pattern = re.compile(r'(?:^|\n)\s*Artikel\s+(\d+[a-z]?)\s*\n', re.MULTILINE)

# Map each article label to the offset of its first occurrence. finditer()
# walks the text front-to-back, so the first hit per label is the earliest,
# which makes a separate sort-then-dedup pass unnecessary.
first_pos = {}
for match in heading_pattern.finditer(full_text):
    label = match.group(1)
    # Filter: Batterieverordnung has articles 1-96, not 114/192/290
    if int(re.match(r'(\d+)', label).group(1)) <= 96 and label not in first_pos:
        first_pos[label] = match.start()

# Position-sorted (offset, article-label) pairs, one entry per article.
headings = sorted((pos, label) for label, pos in first_pos.items())
|
|
|
|
# Report the heading index: total count, a sample with surrounding context,
# and a tail summary when the list is long.
print(f" Found {len(headings)} unique article headings")
for pos, num in headings[:15]:
    # Show context
    ctx = full_text[pos:pos + 60].replace('\n', '|')
    print(f" Pos {pos:6d}: Artikel {num:3s} → '{ctx[:50]}'")
remaining = len(headings) - 15
if remaining > 0:
    print(f" ... and {remaining} more (up to Artikel {headings[-1][1]})")
|
|
|
|
# Normalize full text for searching
full_norm = normalize(full_text)

# Precompute normalized heading positions: each raw offset is mapped into
# normalized-text coordinates by measuring the length of its normalized
# prefix, so matches found in full_norm can be attributed to a heading.
heading_norm_positions = [
    (len(normalize(full_text[:pos])), num) for pos, num in headings
]
|
|
|
|
# Step 3: Get controls from DB
print("\n=== Step 3: Looking up controls ===")
# DATABASE_URL is parsed manually (instead of passing the DSN straight to
# psycopg2) so the search_path option can be injected alongside the parts.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# NOTE(review): execute() is called without a params argument, so psycopg2
# performs no %-interpolation and '%%' is sent to the server literally; as a
# LIKE pattern '%%1542%%' matches the same rows as '%1542%' (doubled
# wildcards are still wildcards). Works, but '%1542%' would be clearer —
# confirm before simplifying.
cur.execute("""
SELECT id, control_id, title, source_original_text,
source_citation->>'article' as existing_article
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE '%%1542%%'
AND source_original_text IS NOT NULL
ORDER BY control_id
""")
controls = cur.fetchall()
print(f" Got {len(controls)} controls")
|
|
|
|
# Step 4: Match
print("\n=== Step 4: Matching controls to PDF articles ===")
found = 0
not_found = 0
results = []  # (db id, control_id, matched article) for each hit

# Hoist the sorted heading offsets once so each article lookup is an
# O(log n) bisect instead of a linear reverse scan per matched control.
_heading_offsets = [p for p, _ in heading_norm_positions]

for ctrl in controls:
    ctrl_id, control_id, title, orig_text, existing_art = ctrl
    orig_norm = normalize(orig_text)

    matched = False
    # Try progressively shorter probe snippets, sampled 1/4 into the text
    # to skip boilerplate lead-ins; a shorter snippet can still land when a
    # longer one straddles a formatting difference.
    for length in [80, 60, 40, 30]:
        start = len(orig_norm) // 4  # always >= 0; no clamp needed
        snippet = orig_norm[start:start + length]
        if len(snippet) < 20:  # also covers the empty-snippet case
            continue
        pos = full_norm.find(snippet)
        if pos >= 0:
            # Rightmost heading at or before the match position; anything
            # before the first heading is preamble text.
            idx = bisect.bisect_right(_heading_offsets, pos) - 1
            article = heading_norm_positions[idx][1] if idx >= 0 else "Preamble"

            status = "MATCH" if existing_art == article else ("NEW" if not existing_art else f"DIFF({existing_art}→{article})")
            print(f" {control_id:10s}: Artikel {article:3s} [{status}] {title[:55]}")
            found += 1
            matched = True
            results.append((ctrl_id, control_id, article))
            break

    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:55]}")
        print(f" Text: '{orig_norm[20:70]}...'")
|
|
|
|
# Final summary: hit rate plus the article range the heading index covered.
print(f"\n=== Result: {found}/{len(controls)} found ({not_found} not found) ===")
if headings:
    first_article, last_article = headings[0][1], headings[-1][1]
    print(f" Articles covered: {first_article} - {last_article}")
conn.close()
|