feat(control-pipeline): persist SGE Knowledge Compiler (capability execution engine)
Move the proven, behavior-equivalent SGE compiler from the macmini build env into
the repo (release hygiene, no behavior change, no new capability/architecture):
- scripts/sge_build.py — structured guidance extractor + capability execution
engine. main() runs run_engine(): C6 Tables -> C7 ReadingOrder -> C1/C2 Sections
-> C4 References, order derived topologically from each capability consume/produce
contract (not hardcoded). Region ownership: C6 claims tables, C7 residual prose.
- scripts/capability_pipeline.py — Region IR {id,bbox,type,state,owner},
claim/consume/produce capabilities, topological resolve_order().
- scripts/reading_order.py — C7 reading-order reconstruction (multi-column reflow;
identity gate on single-column so output==input).
Verified bit-identical: artifact graph (IDs, parent/child, metadata, text) unchanged
vs the pre-engine direct path across 8 docs (4 layout families x EN/DE,
0 mismatch / 0 only_base / 0 only_engine); Golden degraded=0. BUILD_CP default is now
__file__-relative so the script self-locates control-pipeline/services.
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
"""C7 Reading Order Reconstruction (Pilot). ReadingRegion-Modell, Identity-Gate.
|
||||
Scope: NUR Detect Regions / Determine Order / Emit Linear. KEINE Tabellen/Bilder/Sidebars/Fussnoten/Callouts."""
|
||||
import statistics
|
||||
|
||||
def _lines(words, ytol=3.0):
    """Group words into visual text lines by their ``top`` coordinate.

    Words are visited in ascending ``(round(top, 1), x0)`` order. A word joins
    the line being built while its ``top`` lies within ``ytol`` of that line's
    first word; otherwise it opens a new line.

    Args:
        words: Iterable of word dicts; the ``"top"`` and ``"x0"`` keys are read.
        ytol: Largest allowed distance between a word's ``top`` and the ``top``
            of the first word of its line.

    Returns:
        A list of lines, each a non-empty list of word dicts in visiting order.
        Empty input yields an empty list.
    """
    ordered = sorted(words, key=lambda word: (round(word["top"], 1), word["x0"]))
    grouped = []
    anchor = None  # ``top`` of the first word of the line being built
    for word in ordered:
        top = word["top"]
        if anchor is None:
            anchor = top
            grouped.append([word])
        elif abs(top - anchor) <= ytol:
            # The anchor stays on the line's first word; it does not drift.
            grouped[-1].append(word)
        else:
            anchor = top
            grouped.append([word])
    return grouped
|
||||
|
||||
def _gutters(words, W):
    """Locate vertical low-coverage bands (gutters) across a page of width ``W``.

    The width is sampled at 161 evenly spaced x positions. For each position
    the number of words whose ``[x0, x1]`` interval contains it is counted.
    A gutter is a run of consecutive samples whose count is below 15% of the
    median non-zero count (but at least 1), which starts between 15% and 85%
    of ``W`` and is at least 2% of ``W`` wide.

    Args:
        words: Sequence of word dicts; the ``"x0"`` and ``"x1"`` keys are read.
        W: Page width, in the same units as the word coordinates.

    Returns:
        A list of gutter centre x positions, left to right. The list is empty
        when no sample is covered or the median non-zero coverage is below 6.
    """
    steps = 160
    grid = [W * k / steps for k in range(steps + 1)]
    coverage = [
        sum(1 for word in words if word["x0"] <= x <= word["x1"]) for x in grid
    ]

    covered = [count for count in coverage if count > 0]
    if not covered:
        return []
    typical = statistics.median(covered)
    if typical < 6:
        return []

    threshold = max(1, 0.15 * typical)
    lo_x = 0.15 * W
    hi_x = 0.85 * W
    min_width = 0.02 * W

    centers = []
    start = 0
    while start <= steps:
        if not (coverage[start] < threshold and lo_x <= grid[start] <= hi_x):
            start += 1
            continue
        # Extend the run while coverage stays low. Only the run's start is
        # range-checked, so the run itself may continue past hi_x.
        end = start
        while end <= steps and coverage[end] < threshold:
            end += 1
        # Width is measured up to the first non-low sample (clamped to the
        # last grid point); the centre uses the last low sample.
        if grid[min(end, steps)] - grid[start] >= min_width:
            centers.append((grid[start] + grid[end - 1]) / 2)
        start = end
    return centers
|
||||
|
||||
def detect_regions(pg):
    """Classify a page as single- or multi-column and return its words.

    Args:
        pg: Page-like object providing ``extract_words()`` (a list of word
            dicts with ``"x0"``/``"x1"``) and ``width``.
            NOTE(review): presumably a pdfplumber page — confirm with caller.

    Returns:
        A ``(info, words)`` tuple. ``info`` is one of:

        * ``{"type": "single", "reason": "sparse"}`` for fewer than 60 words,
        * ``{"type": "single", "reason": "no-gutter"}`` when ``_gutters``
          finds nothing,
        * ``{"type": "single", "reason": "thin-merged"}`` when every gutter
          was removed while merging under-populated columns,
        * ``{"type": "multi", "cols": [...], "cuts": [...], "ncols": n}``
          otherwise, where ``cols`` are ``(left, right)`` x-bounds.

        ``words`` is the list returned by ``pg.extract_words()``.
    """
    words = pg.extract_words()
    page_width = float(pg.width)

    if len(words) < 60:
        return {"type": "single", "reason": "sparse"}, words

    gutters = _gutters(words, page_width)
    if not gutters:
        return {"type": "single", "reason": "no-gutter"}, words

    centers = [(word["x0"] + word["x1"]) / 2 for word in words]

    def population(left, right):
        # Number of words whose horizontal centre falls in [left, right).
        return sum(1 for c in centers if left <= c < right)

    min_words = max(25, 0.12 * len(words))
    kept = list(gutters)
    while kept:
        edges = [0] + kept + [page_width]
        counts = [population(a, b) for a, b in zip(edges, edges[1:])]
        thinnest = min(range(len(counts)), key=counts.__getitem__)
        if counts[thinnest] >= min_words:
            break
        # Merge the thinnest column into a neighbour by dropping one cut:
        # edge columns have one neighbour; inner columns merge towards the
        # less populated side (left on ties).
        if thinnest == 0:
            del kept[0]
        elif thinnest == len(counts) - 1:
            del kept[-1]
        elif counts[thinnest - 1] <= counts[thinnest + 1]:
            del kept[thinnest - 1]
        else:
            del kept[thinnest]

    if not kept:
        return {"type": "single", "reason": "thin-merged"}, words

    edges = [0] + kept + [page_width]
    columns = list(zip(edges, edges[1:]))
    info = {"type": "multi", "cols": columns, "cuts": kept, "ncols": len(columns)}
    return info, words
|
||||
|
||||
def emit_linear(pg):
    """Return the page text in linear reading order.

    Single-column pages (as classified by ``detect_regions``) are passed
    through unchanged via ``pg.extract_text()``. Multi-column pages are
    reflowed: each text line is split into fragments at the column cuts,
    fragments are buffered per column, and the buffer is emitted column by
    column (left to right, each top to bottom). A fragment whose x-extent
    strictly contains a cut is treated as full-width: it flushes the buffer
    first and is then emitted on its own.

    Args:
        pg: Page-like object providing ``extract_words()``, ``extract_text()``
            and ``width``.
            NOTE(review): presumably a pdfplumber page — confirm with caller.

    Returns:
        The page text as a single string; fragments are joined with newlines
        on the multi-column path. Returns ``""`` when a single-column page
        yields no text.
    """
    info, words = detect_regions(pg)
    if info["type"] == "single":
        # Identity gate: single-column output is exactly the extractor's text.
        return pg.extract_text() or ""

    cuts = info["cuts"]
    cols = info["cols"]

    def col_index(x):
        # Index of the column whose [left, right) range contains x; positions
        # outside every range fall back to the last column.
        for k, (left, right) in enumerate(cols):
            if left <= x < right:
                return k
        return len(cols) - 1

    def flush(pending):
        # Columns left to right, each top to bottom. The sort is stable, so
        # fragments with equal (column, top) keep their buffered order.
        ordered = sorted(pending, key=lambda item: (item[0], item[1]))
        return [text for _, _, text in ordered]

    out = []
    pending = []  # (column index, top, text) fragments awaiting a flush
    for line in _lines(words):
        by_x = sorted(line, key=lambda w: w["x0"])
        frags = [[by_x[0]]]
        for prev, cur in zip(by_x, by_x[1:]):
            # Break the line wherever a cut lies in the gap between two words.
            if any(prev["x1"] <= c <= cur["x0"] for c in cuts):
                frags.append([cur])
            else:
                frags[-1].append(cur)

        for frag in frags:
            x0 = min(w["x0"] for w in frag)
            x1 = max(w["x1"] for w in frag)
            top = min(w["top"] for w in frag)
            text = " ".join(w["text"] for w in frag)
            if any(x0 < c < x1 for c in cuts):
                # Full-width fragment: emit what is buffered, then this one.
                out.extend(flush(pending))
                pending = []
                out.append(text)
            else:
                pending.append((col_index((x0 + x1) / 2), top, text))

    out.extend(flush(pending))
    return "\n".join(out)
|
||||
|
||||
|
||||
def emit_words(ws, W):
    """Linearise a list of words into reading-order text.

    With fewer than 60 words, or when ``_gutters`` finds no gutter, the words
    are sorted by ``(round(top, 1), x0)`` and joined with single spaces.
    Otherwise each text line is split into fragments at the gutters, each
    fragment is assigned to the column containing its horizontal midpoint,
    and the fragments are emitted column by column (left to right), one per
    output line.

    Args:
        ws: Sequence of word dicts; ``"text"``, ``"top"``, ``"x0"`` and
            ``"x1"`` are read.
        W: Page width, in the same units as the word coordinates.

    Returns:
        The linearised text. On the multi-column path fragments are joined
        with newlines; within a column they are ordered by ``(top, text)``.
    """

    def flat(items):
        ordered = sorted(items, key=lambda w: (round(w["top"], 1), w["x0"]))
        return " ".join(w["text"] for w in ordered)

    if len(ws) < 60:
        return flat(ws)
    cuts = _gutters(ws, W)
    if not cuts:
        return flat(ws)

    edges = [0] + cuts + [W]
    cols = list(zip(edges, edges[1:]))

    def col_index(x):
        # Column whose [left, right) range contains x; otherwise the last one.
        for k, (left, right) in enumerate(cols):
            if left <= x < right:
                return k
        return len(cols) - 1

    per_col = {}
    for line in _lines(ws):
        by_x = sorted(line, key=lambda w: w["x0"])
        frags = [[by_x[0]]]
        for prev, cur in zip(by_x, by_x[1:]):
            # Break the line wherever a gutter lies in the gap between words.
            if any(prev["x1"] <= c <= cur["x0"] for c in cuts):
                frags.append([cur])
            else:
                frags[-1].append(cur)

        for frag in frags:
            left = min(w["x0"] for w in frag)
            right = max(w["x1"] for w in frag)
            top = min(w["top"] for w in frag)
            text = " ".join(w["text"] for w in frag)
            per_col.setdefault(col_index((left + right) / 2), []).append((top, text))

    out = []
    for key in sorted(per_col):
        out.extend(text for _, text in sorted(per_col[key]))
    return "\n".join(out)
|
||||
Reference in New Issue
Block a user