feat(control-pipeline): persist SGE Knowledge Compiler (capability execution engine)

Move the proven, behavior-equivalent SGE compiler from the macmini build env into the repo (release hygiene, no behavior change, no new capability/architecture): - scripts/sge_build.py — structured guidance extractor + capability execution engine. main() runs run_engine(): C6 Tables -> C7 ReadingOrder -> C1/C2 Sections -> C4 References, order derived topologically from each capability's consume/produce contract (not hardcoded). Region ownership: C6 claims tables, C7 claims residual prose. - scripts/capability_pipeline.py — Region IR {id,bbox,type,state,owner}, claim/consume/produce capabilities, topological resolve_order(). - scripts/reading_order.py — C7 reading-order reconstruction (multi-column reflow; identity gate on single-column so output==input). Verified bit-identical: artifact graph (IDs, parent/child, metadata, text) unchanged vs the pre-engine direct path across 8 docs (4 layout families x EN/DE, 0 mismatch / 0 only_base / 0 only_engine); Golden degraded=0. BUILD_CP default is now __file__-relative so the script self-locates control-pipeline/services.
2026-06-28 13:38:18 +02:00
parent 3b466be140
commit a8412e3db7
3 changed files with 954 additions and 0 deletions
@@ -0,0 +1,107 @@
+"""C7 Reading Order Reconstruction (pilot). ReadingRegion model with an identity gate.
+Scope: ONLY detect regions / determine order / emit linear. NO tables, images, sidebars, footnotes, or callouts."""
+import statistics
+
+def _lines(words, ytol=3.0):
+    """Cluster words into visual text lines by their ``top`` coordinate.
+
+    The words are ordered by ``(round(top, 1), x0)`` and swept once. A word
+    stays on the current line while its ``top`` lies within ``ytol`` of the
+    line's first word (the anchor does not drift); otherwise it opens a new
+    line.
+
+    Args:
+        words: iterable of dicts carrying at least ``"top"`` and ``"x0"``.
+        ytol: largest allowed vertical distance from the line's anchor word.
+
+    Returns:
+        A list of lines; each line is a non-empty list of word dicts in sweep
+        order (not necessarily left-to-right).
+    """
+    ordered = sorted(words, key=lambda word: (round(word["top"], 1), word["x0"]))
+    grouped = []
+    anchor_top = None
+    for word in ordered:
+        if anchor_top is not None and abs(word["top"] - anchor_top) <= ytol:
+            grouped[-1].append(word)
+            continue
+        # First word overall, or too far from the anchor: open a new line.
+        grouped.append([word])
+        anchor_top = word["top"]
+    return grouped
+
+def _gutters(words, W):
+    """Find x positions of vertical low-coverage bands (candidate column gutters).
+
+    Word coverage is sampled at 161 evenly spaced x positions across ``[0, W]``;
+    the coverage of a sample is the number of words whose ``[x0, x1]`` span
+    contains it. The median of the non-zero coverages is taken as the body
+    density; if it is below 6 (or nothing is covered) no gutters are reported.
+
+    A gutter is a run of consecutive samples whose coverage is below
+    ``max(1, 15% of the body density)``. A run may begin only at a sample
+    inside the central 15%-85% band of the page width, though it can extend
+    past the band's right edge. Runs narrower than 2% of ``W`` are ignored.
+
+    Args:
+        words: re-iterable collection of dicts with ``"x0"`` and ``"x1"``.
+        W: page width in the same units as the word coordinates.
+
+    Returns:
+        List of gutter centre x coordinates, in left-to-right order.
+    """
+    steps = 160
+    sample_x = [W * k / steps for k in range(steps + 1)]
+    coverage = [
+        sum(1 for word in words if word["x0"] <= x <= word["x1"])
+        for x in sample_x
+    ]
+    occupied = [count for count in coverage if count > 0]
+    if not occupied:
+        return []
+    typical = statistics.median(occupied)
+    if typical < 6:
+        return []
+
+    threshold = max(1, 0.15 * typical)
+    band_left = 0.15 * W
+    band_right = 0.85 * W
+    min_width = 0.02 * W
+
+    centers = []
+    start = 0
+    while start <= steps:
+        opens_run = (
+            coverage[start] < threshold
+            and band_left <= sample_x[start] <= band_right
+        )
+        if not opens_run:
+            start += 1
+            continue
+        # Extend the run to the first sample that is dense again (exclusive end).
+        stop = start
+        while stop <= steps and coverage[stop] < threshold:
+            stop += 1
+        if sample_x[min(stop, steps)] - sample_x[start] >= min_width:
+            centers.append((sample_x[start] + sample_x[min(stop - 1, steps)]) / 2)
+        start = stop
+    return centers
+
+def detect_regions(pg):
+    """Classify a page as single- or multi-column and return its words.
+
+    Gutter candidates come from ``_gutters``. Columns that hold too few words
+    (fewer than ``max(25, 12% of all words)``, counted by word centre) are
+    merged away one at a time: the thinnest column loses the cut that
+    separates it from its smaller neighbour (an edge column loses its only
+    cut). This repeats until every column is large enough or no cut is left.
+
+    Args:
+        pg: page object exposing ``extract_words()`` and ``width``
+            (presumably a pdfplumber page - confirm against the caller).
+
+    Returns:
+        ``(info, words)`` where ``words`` is the ``extract_words()`` result and
+        ``info`` is either ``{"type": "single", "reason": ...}`` with reason
+        ``"sparse"`` (fewer than 60 words), ``"no-gutter"`` or
+        ``"thin-merged"``, or ``{"type": "multi", "cols", "cuts", "ncols"}``.
+    """
+    words = pg.extract_words()
+    page_width = float(pg.width)
+    if len(words) < 60:
+        return {"type": "single", "reason": "sparse"}, words
+    gutters = _gutters(words, page_width)
+    if not gutters:
+        return {"type": "single", "reason": "no-gutter"}, words
+
+    centers = [(word["x0"] + word["x1"]) / 2 for word in words]
+    min_words = max(25, 0.12 * len(words))
+    kept = list(gutters)
+    while kept:
+        edges = [0] + kept + [page_width]
+        counts = [
+            sum(1 for c in centers if lo <= c < hi)
+            for lo, hi in zip(edges, edges[1:])
+        ]
+        thinnest = counts.index(min(counts))  # first column with the fewest words
+        if counts[thinnest] >= min_words:
+            break
+        if thinnest == 0:
+            drop = 0
+        elif thinnest == len(counts) - 1 or counts[thinnest - 1] <= counts[thinnest + 1]:
+            # Last column, or left neighbour is the smaller one: remove the left cut.
+            drop = thinnest - 1
+        else:
+            drop = thinnest
+        del kept[drop]
+
+    if not kept:
+        return {"type": "single", "reason": "thin-merged"}, words
+    edges = [0] + kept + [page_width]
+    columns = list(zip(edges, edges[1:]))
+    return {"type": "multi", "cols": columns, "cuts": kept, "ncols": len(columns)}, words
+
+def emit_linear(pg):
+    """Return the page text linearised in reading order.
+
+    Identity gate: when ``detect_regions`` reports a single region the result
+    is exactly ``pg.extract_text()`` (or ``""`` if that is falsy).
+
+    On a multi-column page every text line is split into fragments wherever a
+    column cut lies between two neighbouring words. A fragment that still
+    straddles a cut is treated as full-width: it first flushes the column
+    fragments collected so far and is then emitted itself. All other
+    fragments are collected per column (chosen by the fragment's horizontal
+    midpoint) and flushed column by column, top to bottom within a column.
+
+    Args:
+        pg: page object exposing ``extract_words()``, ``extract_text()`` and
+            ``width`` (presumably a pdfplumber page - confirm against the
+            caller).
+
+    Returns:
+        The linearised text, one fragment per line, joined with ``"\n"``.
+    """
+    info, words = detect_regions(pg)
+    if info["type"] == "single":
+        return pg.extract_text() or ""
+    cuts = info["cuts"]
+    cols = info["cols"]
+
+    def colidx(x):
+        # Index of the column whose half-open range contains x; x == page
+        # width (or anything outside every range) falls into the last column.
+        for k, (lo, hi) in enumerate(cols):
+            if lo <= x < hi:
+                return k
+        return len(cols) - 1
+
+    out = []
+    pending = {}  # column index -> [(top, text), ...] in arrival order
+
+    def flush():
+        # Emit buffered fragments column by column; sorted() is stable, so
+        # fragments with equal top keep their arrival order.
+        for k in sorted(pending):
+            out.extend(text for _top, text in sorted(pending[k], key=lambda item: item[0]))
+        pending.clear()
+
+    for line in _lines(words):
+        ordered = sorted(line, key=lambda w: w["x0"])
+        frags = [[ordered[0]]]
+        for prev, word in zip(ordered, ordered[1:]):
+            if any(prev["x1"] <= c <= word["x0"] for c in cuts):
+                frags.append([word])
+            else:
+                frags[-1].append(word)
+        for frag in frags:
+            x0 = min(w["x0"] for w in frag)
+            x1 = max(w["x1"] for w in frag)
+            text = " ".join(w["text"] for w in frag)
+            if any(x0 < c < x1 for c in cuts):
+                # Full-width fragment: it closes the column block above it.
+                flush()
+                out.append(text)
+            else:
+                top = min(w["top"] for w in frag)
+                pending.setdefault(colidx((x0 + x1) / 2), []).append((top, text))
+    flush()
+    return "\n".join(out)
+
+
+def emit_words(ws, W):
+    """Linearise a bag of words into text, reflowing columns when gutters exist.
+
+    With fewer than 60 words, or when ``_gutters`` finds no gutter, the words
+    are ordered by ``(round(top, 1), x0)`` and joined with single spaces.
+
+    Otherwise each text line is split into fragments wherever a gutter lies
+    between two neighbouring words; each fragment is assigned to the column
+    containing its horizontal midpoint. Fragments are emitted column by
+    column, ordered by ``(top, text)`` inside a column, one per output line.
+
+    Args:
+        ws: sequence of word dicts with ``"x0"``, ``"x1"``, ``"top"``, ``"text"``.
+        W: page width in the same units as the word coordinates.
+
+    Returns:
+        The linearised text.
+    """
+    def flatten(items):
+        ordered = sorted(items, key=lambda word: (round(word["top"], 1), word["x0"]))
+        return " ".join(word["text"] for word in ordered)
+
+    if len(ws) < 60:
+        return flatten(ws)
+    cuts = _gutters(ws, W)
+    if not cuts:
+        return flatten(ws)
+
+    edges = [0] + cuts + [W]
+    columns = list(zip(edges, edges[1:]))
+
+    def column_of(x):
+        # First column whose half-open range holds x, else the last column.
+        hits = (k for k, (lo, hi) in enumerate(columns) if lo <= x < hi)
+        return next(hits, len(columns) - 1)
+
+    per_column = {}
+    for line in _lines(ws):
+        ordered = sorted(line, key=lambda word: word["x0"])
+        fragments = [[ordered[0]]]
+        for prev, word in zip(ordered, ordered[1:]):
+            crosses_gutter = any(prev["x1"] <= cut <= word["x0"] for cut in cuts)
+            if crosses_gutter:
+                fragments.append([word])
+            else:
+                fragments[-1].append(word)
+        for fragment in fragments:
+            left = min(word["x0"] for word in fragment)
+            right = max(word["x1"] for word in fragment)
+            top = min(word["top"] for word in fragment)
+            text = " ".join(word["text"] for word in fragment)
+            per_column.setdefault(column_of((left + right) / 2), []).append((top, text))
+
+    pieces = [
+        text
+        for k in sorted(per_column)
+        for _top, text in sorted(per_column[k])
+    ]
+    return "\n".join(pieces)