Files
breakpilot-core/control-pipeline/scripts/reading_order.py
T
Benjamin Boenisch a8412e3db7 feat(control-pipeline): persist SGE Knowledge Compiler (capability execution engine)
Move the proven, behavior-equivalent SGE compiler from the macmini build env into
the repo (release hygiene, no behavior change, no new capability/architecture):
- scripts/sge_build.py — structured guidance extractor + capability execution
  engine. main() runs run_engine(): C6 Tables -> C7 ReadingOrder -> C1/C2 Sections
  -> C4 References, order derived topologically from each capability consume/produce
  contract (not hardcoded). Region ownership: C6 claims tables, C7 residual prose.
- scripts/capability_pipeline.py — Region IR {id,bbox,type,state,owner},
  claim/consume/produce capabilities, topological resolve_order().
- scripts/reading_order.py — C7 reading-order reconstruction (multi-column reflow;
  identity gate on single-column so output==input).

Verified bit-identical: artifact graph (IDs, parent/child, metadata, text) unchanged
vs the pre-engine direct path across 8 docs (4 layout families x EN/DE,
0 mismatch / 0 only_base / 0 only_engine); Golden degraded=0. BUILD_CP default is now
__file__-relative so the script self-locates control-pipeline/services.
2026-06-28 13:38:18 +02:00

108 lines
4.3 KiB
Python

"""C7 Reading Order Reconstruction (Pilot). ReadingRegion-Modell, Identity-Gate.
Scope: ONLY Detect Regions / Determine Order / Emit Linear. NO tables/images/sidebars/footnotes/callouts."""
import statistics
def _lines(words, ytol=3.0):
    """Group words into text lines by vertical position.

    Words are visited in order of ``(round(top, 1), x0)``. A word joins the
    current line while its ``top`` lies within ``ytol`` of the ``top`` of the
    word that opened that line; otherwise it opens a new line.

    Args:
        words: Iterable of word dicts providing at least ``"top"`` and ``"x0"``.
        ytol: Largest allowed vertical distance from the line's first word.

    Returns:
        A list of lines, each a non-empty list of word dicts. Words inside a
        line keep the visiting order above, so they are not necessarily
        sorted by ``x0``.
    """
    ordered = sorted(words, key=lambda word: (round(word["top"], 1), word["x0"]))
    grouped = []
    anchor_top = None  # "top" of the word that opened the current line
    for word in ordered:
        if anchor_top is None or not abs(word["top"] - anchor_top) <= ytol:
            grouped.append([word])
            anchor_top = word["top"]
        else:
            grouped[-1].append(word)
    return grouped
def _gutters(words, W):
G=160; xs=[W*i/G for i in range(G+1)]
cov=[sum(1 for w in words if w["x0"]<=x<=w["x1"]) for x in xs]
pos=[c for c in cov if c>0]
if not pos: return []
body=statistics.median(pos)
if body<6: return []
thr=max(1,0.15*body); gut=[]; i=0
while i<=G:
if cov[i]<thr and 0.15*W<=xs[i]<=0.85*W:
j=i
while j<=G and cov[j]<thr: j+=1
if xs[min(j,G)]-xs[i]>=0.02*W: gut.append((xs[i]+xs[min(j-1,G)])/2)
i=j
else: i+=1
return gut
def detect_regions(pg):
    """Classify a page as single- or multi-column and return its words.

    Args:
        pg: Page object exposing ``extract_words()`` and ``width``
            (presumably a pdfplumber page - confirm against the caller).

    Returns:
        A ``(info, words)`` tuple. ``info`` is either
        ``{"type": "single", "reason": ...}`` with reason ``"sparse"`` (fewer
        than 60 words), ``"no-gutter"`` or ``"thin-merged"``, or
        ``{"type": "multi", "cols": [(left, right), ...], "cuts": [...],
        "ncols": n}``. ``words`` is the list returned by ``extract_words()``.
    """
    words = pg.extract_words()
    page_width = float(pg.width)

    def single(reason):
        return {"type": "single", "reason": reason}, words

    if len(words) < 60:
        return single("sparse")
    gutters = _gutters(words, page_width)
    if not gutters:
        return single("no-gutter")

    # Words are assigned to a column by the horizontal centre of their box.
    centres = [(word["x0"] + word["x1"]) / 2 for word in words]

    def population(left, right):
        return len([c for c in centres if left <= c < right])

    floor = max(25, 0.12 * len(words))
    kept = list(gutters)
    # Repeatedly dissolve the emptiest column while it holds fewer than
    # `floor` words, by dropping one of the cuts that bound it.
    while kept:
        edges = [0, *kept, page_width]
        counts = [population(a, b) for a, b in zip(edges, edges[1:])]
        thinnest = counts.index(min(counts))
        if counts[thinnest] >= floor:
            break
        if thinnest == 0:
            kept.pop(0)
        elif thinnest == len(counts) - 1:
            kept.pop()
        elif counts[thinnest - 1] <= counts[thinnest + 1]:
            # Merge into the less populated neighbour (left on ties).
            kept.pop(thinnest - 1)
        else:
            kept.pop(thinnest)

    if not kept:
        return single("thin-merged")
    edges = [0, *kept, page_width]
    columns = list(zip(edges, edges[1:]))
    return {"type": "multi", "cols": columns, "cuts": kept, "ncols": len(columns)}, words
def emit_linear(pg):
    """Return the text of a page linearised in column reading order.

    When ``detect_regions`` classifies the page as single-column, the result
    of ``pg.extract_text()`` is returned untouched (or ``""`` when that is
    falsy). Otherwise every text line is split into fragments wherever a
    column cut lies between two neighbouring words. A fragment whose extent
    strictly contains a cut is treated as full-width: it is emitted at its
    position in line order and acts as a barrier. Fragments between two
    barriers are emitted column by column, top to bottom within each column.

    Args:
        pg: Page object exposing ``extract_words()``, ``extract_text()`` and
            ``width`` (presumably a pdfplumber page - confirm against caller).

    Returns:
        The reconstructed text, one fragment per line, joined with newlines.
    """
    info, words = detect_regions(pg)
    if info["type"] == "single":
        return pg.extract_text() or ""
    cuts = info["cuts"]
    cols = info["cols"]

    def column_of(x):
        for idx, (left, right) in enumerate(cols):
            if left <= x < right:
                return idx
        # x sits on or beyond the right page edge: use the last column.
        return len(cols) - 1

    out = []
    pending = {}  # column index -> [(top, text), ...] since the last barrier

    def flush():
        # sorted() is stable, so fragments sharing a top keep line order.
        for idx in sorted(pending):
            ranked = sorted(pending[idx], key=lambda item: item[0])
            out.extend(text for _, text in ranked)
        pending.clear()

    for line in _lines(words):
        ordered = sorted(line, key=lambda word: word["x0"])
        fragments = [[ordered[0]]]
        for prev_word, word in zip(ordered, ordered[1:]):
            if any(prev_word["x1"] <= cut <= word["x0"] for cut in cuts):
                fragments.append([word])
            else:
                fragments[-1].append(word)
        for fragment in fragments:
            left = min(word["x0"] for word in fragment)
            right = max(word["x1"] for word in fragment)
            top = min(word["top"] for word in fragment)
            text = " ".join(word["text"] for word in fragment)
            if any(left < cut < right for cut in cuts):
                # Full-width fragment: close the column run before emitting it.
                flush()
                out.append(text)
            else:
                pending.setdefault(column_of((left + right) / 2), []).append((top, text))
    flush()
    return "\n".join(out)
def emit_words(ws, W):
    """Linearise a bare list of words into column reading order.

    With fewer than 60 words, or when ``_gutters`` finds no cut, the words are
    joined with single spaces in ``(round(top, 1), x0)`` order. Otherwise each
    text line is split into fragments at the cuts, every fragment is assigned
    to the column containing its horizontal centre, and the fragments are
    emitted column by column in ascending ``(top, text)`` order, one per line.

    Args:
        ws: Sequence of word dicts with ``"text"``, ``"top"``, ``"x0"`` and
            ``"x1"``.
        W: Page width in the same units as the word coordinates.

    Returns:
        The linearised text as a single string.
    """
    def flatten():
        in_order = sorted(ws, key=lambda word: (round(word["top"], 1), word["x0"]))
        return " ".join(word["text"] for word in in_order)

    if len(ws) < 60:
        return flatten()
    cuts = _gutters(ws, W)
    if not cuts:
        return flatten()

    edges = [0, *cuts, W]
    cols = list(zip(edges, edges[1:]))

    def column_of(x):
        for idx, (left, right) in enumerate(cols):
            if left <= x < right:
                return idx
        # x sits on or beyond the right page edge: use the last column.
        return len(cols) - 1

    per_column = {}
    for line in _lines(ws):
        ordered = sorted(line, key=lambda word: word["x0"])
        fragments = [[ordered[0]]]
        for prev_word, word in zip(ordered, ordered[1:]):
            if any(prev_word["x1"] <= cut <= word["x0"] for cut in cuts):
                fragments.append([word])
            else:
                fragments[-1].append(word)
        for fragment in fragments:
            left = min(word["x0"] for word in fragment)
            right = max(word["x1"] for word in fragment)
            entry = (
                min(word["top"] for word in fragment),
                " ".join(word["text"] for word in fragment),
            )
            per_column.setdefault(column_of((left + right) / 2), []).append(entry)

    pieces = []
    for idx in sorted(per_column):
        pieces.extend(text for _, text in sorted(per_column[idx]))
    return "\n".join(pieces)