"""C7 Reading Order Reconstruction (Pilot). ReadingRegion-Modell, Identity-Gate. Scope: NUR Detect Regions / Determine Order / Emit Linear. KEINE Tabellen/Bilder/Sidebars/Fussnoten/Callouts.""" import statistics def _lines(words, ytol=3.0): ws=sorted(words,key=lambda w:(round(w["top"],1),w["x0"])); lines=[]; cur=[]; cy=None for w in ws: if cy is None or abs(w["top"]-cy)<=ytol: cur.append(w); cy=w["top"] if cy is None else cy else: lines.append(cur); cur=[w]; cy=w["top"] if cur: lines.append(cur) return lines def _gutters(words, W): G=160; xs=[W*i/G for i in range(G+1)] cov=[sum(1 for w in words if w["x0"]<=x<=w["x1"]) for x in xs] pos=[c for c in cov if c>0] if not pos: return [] body=statistics.median(pos) if body<6: return [] thr=max(1,0.15*body); gut=[]; i=0 while i<=G: if cov[i]=0.02*W: gut.append((xs[i]+xs[min(j-1,G)])/2) i=j else: i+=1 return gut def detect_regions(pg): ws=pg.extract_words(); W=float(pg.width) if len(ws)<60: return {"type":"single","reason":"sparse"}, ws cuts=_gutters(ws,W) if not cuts: return {"type":"single","reason":"no-gutter"}, ws def rc(a,b): return sum(1 for w in ws if a<=(w["x0"]+w["x1"])/2=1 else ("col",colidx((x0+x1)/2),top,text)) out=[]; buf=[] def flush(b): res=[] for k in sorted(set(x[1] for x in b)): for x in sorted([x for x in b if x[1]==k], key=lambda x:x[2]): res.append(x[3]) return res for it in seq: if it[0]=="full": if buf: out+=flush(buf); buf=[] out.append(it[3]) else: buf.append(it) if buf: out+=flush(buf) return "\n".join(out) def emit_words(ws, W): flat=lambda L: " ".join(w["text"] for w in sorted(L,key=lambda w:(round(w["top"],1),w["x0"]))) if len(ws)<60: return flat(ws) cuts=_gutters(ws,W) if not cuts: return flat(ws) bounds=[0]+cuts+[W]; cols=[(bounds[k],bounds[k+1]) for k in range(len(cuts)+1)] def colidx(x): for k,c in enumerate(cols): if c[0]<=x