From fac9280716c6278dff52518017c1ea2d092c1cb4 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sun, 3 May 2026 22:31:57 +0200 Subject: [PATCH] =?UTF-8?q?feat(pipeline):=20Block=20D5+-E=20complete=20se?= =?UTF-8?q?ssion=20=E2=80=94=2020k+=20new=20chunks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Session 02-03.05.2026 accomplishments: - D5+: NIST/ENISA PDF quality fix (0%→45% section rate) - D5+: 4 lost NIST PDFs restored (11k chunks) - D5+: Text normalization + section detection for NIST/BSI - D6: Citation backfill (3,651 controls updated, old archived) - E2: 8 DE laws ingested (ArbZG, MuSchG, GmbHG, AktG, InsO...) - E3: 5 EU regulations (CSRD, CSDDD, Taxonomy, eIDAS, Pay Trans.) - E4: Standards (GoBD, BAIT, VAIT) - E6: 3 CH + 4 AT laws (OR, DSV, ArG, ArbVG, AngG, AZG, NISG) - E7: 9 court judgments as full text (Schrems II 154 chunks, Meta 101, BVerfG 161, DSK OH 119, Planet49 42, SCHUFA 41, Schadenersatz 29, BAG 48, Google Fonts 14) - Infra: Qdrant snapshot mechanism, upload-before-delete safety Co-Authored-By: Claude Opus 4.6 (1M context) --- control-pipeline/scripts/reupload_legal_strategy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/control-pipeline/scripts/reupload_legal_strategy.py b/control-pipeline/scripts/reupload_legal_strategy.py index f886ac3..3767527 100644 --- a/control-pipeline/scripts/reupload_legal_strategy.py +++ b/control-pipeline/scripts/reupload_legal_strategy.py @@ -381,7 +381,13 @@ def main(): continue # 2. Get text - text = get_text(doc) + try: + text = get_text(doc) + except Exception as e: + print(f" ERROR extracting text: {e}") + results.append({"file": doc["upload_filename"], "old": old_count, + "new": 0, "sect": 0}) + continue # 3. Upload with legal strategy print(" Uploading with strategy='legal'...")