a8412e3db7
Move the proven, behavior-equivalent SGE compiler from the macmini build env into
the repo (release hygiene, no behavior change, no new capability/architecture):
- scripts/sge_build.py — structured guidance extractor + capability execution
engine. main() runs run_engine(): C6 Tables -> C7 ReadingOrder -> C1/C2 Sections
-> C4 References, with the order derived topologically from each capability's
consume/produce contract (not hardcoded). Region ownership: C6 claims tables, C7 claims the residual prose.
- scripts/capability_pipeline.py — Region IR {id,bbox,type,state,owner},
claim/consume/produce capabilities, topological resolve_order().
- scripts/reading_order.py — C7 reading-order reconstruction (multi-column reflow;
identity gate on single-column so output==input).
Verified bit-identical: artifact graph (IDs, parent/child, metadata, text) unchanged
vs the pre-engine direct path across 8 docs (4 layout families x EN/DE,
0 mismatch / 0 only_base / 0 only_engine); Golden degraded=0. BUILD_CP default is now
__file__-relative so the script self-locates control-pipeline/services.
108 lines
4.3 KiB
Python
108 lines
4.3 KiB
Python
"""C7 Reading Order Reconstruction (pilot). ReadingRegion model, identity gate.

Scope: ONLY detect regions / determine order / emit linear. NO tables, images, sidebars, footnotes, or callouts."""
|
|
import statistics
|
|
|
|
def _lines(words, ytol=3.0):
|
|
ws=sorted(words,key=lambda w:(round(w["top"],1),w["x0"])); lines=[]; cur=[]; cy=None
|
|
for w in ws:
|
|
if cy is None or abs(w["top"]-cy)<=ytol: cur.append(w); cy=w["top"] if cy is None else cy
|
|
else: lines.append(cur); cur=[w]; cy=w["top"]
|
|
if cur: lines.append(cur)
|
|
return lines
|
|
|
|
def _gutters(words, W):
|
|
G=160; xs=[W*i/G for i in range(G+1)]
|
|
cov=[sum(1 for w in words if w["x0"]<=x<=w["x1"]) for x in xs]
|
|
pos=[c for c in cov if c>0]
|
|
if not pos: return []
|
|
body=statistics.median(pos)
|
|
if body<6: return []
|
|
thr=max(1,0.15*body); gut=[]; i=0
|
|
while i<=G:
|
|
if cov[i]<thr and 0.15*W<=xs[i]<=0.85*W:
|
|
j=i
|
|
while j<=G and cov[j]<thr: j+=1
|
|
if xs[min(j,G)]-xs[i]>=0.02*W: gut.append((xs[i]+xs[min(j-1,G)])/2)
|
|
i=j
|
|
else: i+=1
|
|
return gut
|
|
|
|
def detect_regions(pg):
    """Classify a page as single- or multi-column and return its words.

    Args:
        pg: page object providing ``extract_words()`` and ``width``
            (presumably a pdfplumber page -- confirm against the caller).

    Returns:
        A tuple ``(info, words)``. ``info`` is either
        ``{"type": "single", "reason": ...}`` with reason ``"sparse"``
        (fewer than 60 words), ``"no-gutter"`` or ``"thin-merged"``, or
        ``{"type": "multi", "cols": [(x_lo, x_hi), ...], "cuts": [...],
        "ncols": n}``. ``words`` is the list returned by
        ``pg.extract_words()``.
    """
    words = pg.extract_words()
    width = float(pg.width)

    def single(reason):
        return {"type": "single", "reason": reason}, words

    if len(words) < 60:
        return single("sparse")

    gutters = _gutters(words, width)
    if not gutters:
        return single("no-gutter")

    centres = [(w["x0"] + w["x1"]) / 2 for w in words]

    def population(lo, hi):
        # Number of words whose horizontal centre lies in [lo, hi).
        return sum(1 for c in centres if lo <= c < hi)

    # A column must hold at least 25 words and at least 12% of the page.
    floor = max(25, 0.12 * len(words))

    kept = list(gutters)
    while kept:
        edges = [0] + kept + [width]
        counts = [population(lo, hi) for lo, hi in zip(edges, edges[1:])]
        thinnest = counts.index(min(counts))
        if counts[thinnest] >= floor:
            break
        # Dissolve the thinnest column by dropping one of its bounding cuts:
        # outer columns lose their only cut, inner columns merge into the
        # neighbour with fewer words (the left one on a tie).
        if thinnest == 0:
            drop = 0
        elif thinnest == len(counts) - 1:
            drop = len(kept) - 1
        elif counts[thinnest - 1] <= counts[thinnest + 1]:
            drop = thinnest - 1
        else:
            drop = thinnest
        del kept[drop]

    if not kept:
        return single("thin-merged")

    edges = [0] + kept + [width]
    cols = list(zip(edges, edges[1:]))
    return {"type": "multi", "cols": cols, "cuts": kept, "ncols": len(cols)}, words
|
|
|
|
def emit_linear(pg):
    """Return the page text in reading order, reflowing multi-column pages.

    Single-column pages (as classified by ``detect_regions``) pass through
    the identity gate: the result is ``pg.extract_text()`` or ``""`` when
    that is falsy. On multi-column pages every text line is split into
    fragments wherever a column cut lies in the gap between two neighbouring
    words. A fragment that still straddles a cut is emitted as a full-width
    line; the remaining fragments are buffered and emitted column by column
    (left to right, top to bottom) each time a full-width line is reached
    and once more at the end.

    Args:
        pg: page object providing ``extract_words()``, ``extract_text()``
            and ``width`` (presumably a pdfplumber page -- confirm against
            the caller).

    Returns:
        The linearised text, one fragment per line, joined with ``"\\n"``.
    """
    info, words = detect_regions(pg)
    if info["type"] == "single":
        return pg.extract_text() or ""

    cuts = info["cuts"]
    cols = info["cols"]

    def colidx(x):
        # Index of the column whose [lo, hi) range contains x; positions
        # outside every range fall back to the last column.
        for k, (lo, hi) in enumerate(cols):
            if lo <= x < hi:
                return k
        return len(cols) - 1

    # Each entry is (kind, column index or None, top, text).
    seq = []
    for line in _lines(words):
        ordered = sorted(line, key=lambda w: w["x0"])
        frags = [[ordered[0]]]
        for prev, cur in zip(ordered, ordered[1:]):
            if any(prev["x1"] <= c <= cur["x0"] for c in cuts):
                frags.append([cur])
            else:
                frags[-1].append(cur)
        for frag in frags:
            x0 = min(w["x0"] for w in frag)
            x1 = max(w["x1"] for w in frag)
            top = min(w["top"] for w in frag)
            text = " ".join(w["text"] for w in frag)
            if any(x0 < c < x1 for c in cuts):
                seq.append(("full", None, top, text))
            else:
                seq.append(("col", colidx((x0 + x1) / 2), top, text))

    def flush(items):
        # One stable sort by (column, top) yields the same order as sorting
        # each column's fragments by top and concatenating the columns.
        return [it[3] for it in sorted(items, key=lambda it: (it[1], it[2]))]

    out = []
    pending = []
    for item in seq:
        if item[0] == "full":
            # A full-width line closes the column block above it.
            if pending:
                out += flush(pending)
                pending = []
            out.append(item[3])
        else:
            pending.append(item)
    if pending:
        out += flush(pending)
    return "\n".join(out)
|
|
|
|
|
|
def emit_words(ws, W):
    """Linearise a list of word dicts without needing a page object.

    With fewer than 60 words, or when ``_gutters`` finds no gutter, the
    words are ordered by ``(round(top, 1), x0)`` and joined with single
    spaces. Otherwise each text line is split into fragments at the gutters,
    each fragment is assigned to the column containing its horizontal
    midpoint, and the fragments are emitted column by column, one per line.

    Args:
        ws: list of dicts providing ``"text"``, ``"top"``, ``"x0"``, ``"x1"``.
        W: width of the area the words were taken from.

    Returns:
        The linearised text as a single string.
    """
    def flat(items):
        ordered = sorted(items, key=lambda w: (round(w["top"], 1), w["x0"]))
        return " ".join(w["text"] for w in ordered)

    if len(ws) < 60:
        return flat(ws)
    cuts = _gutters(ws, W)
    if not cuts:
        return flat(ws)

    edges = [0] + cuts + [W]
    cols = list(zip(edges, edges[1:]))

    def colidx(x):
        # Positions outside every [lo, hi) range fall back to the last column.
        for k, (lo, hi) in enumerate(cols):
            if lo <= x < hi:
                return k
        return len(cols) - 1

    per_col = {}
    for line in _lines(ws):
        ordered = sorted(line, key=lambda w: w["x0"])
        frags = [[ordered[0]]]
        for prev, cur in zip(ordered, ordered[1:]):
            if any(prev["x1"] <= c <= cur["x0"] for c in cuts):
                frags.append([cur])
            else:
                frags[-1].append(cur)
        for frag in frags:
            left = min(w["x0"] for w in frag)
            right = max(w["x1"] for w in frag)
            top = min(w["top"] for w in frag)
            text = " ".join(w["text"] for w in frag)
            per_col.setdefault(colidx((left + right) / 2), []).append((top, text))

    out = []
    for k in sorted(per_col):
        # Tuples sort by top first, then by text on equal tops.
        out.extend(text for _, text in sorted(per_col[k]))
    return "\n".join(out)
|