feat(control-pipeline): persist SGE Knowledge Compiler (capability execution engine)
Move the proven, behavior-equivalent SGE compiler from the macmini build env into
the repo (release hygiene, no behavior change, no new capability/architecture):
- scripts/sge_build.py — structured guidance extractor + capability execution
engine. main() runs run_engine(): C6 Tables -> C7 ReadingOrder -> C1/C2 Sections
-> C4 References, with the order derived topologically from each capability's
consume/produce contract (not hardcoded). Region ownership: C6 claims tables, C7 takes the residual prose.
- scripts/capability_pipeline.py — Region IR {id,bbox,type,state,owner},
claim/consume/produce capabilities, topological resolve_order().
- scripts/reading_order.py — C7 reading-order reconstruction (multi-column reflow;
identity gate on single-column so output==input).
Verified bit-identical: artifact graph (IDs, parent/child, metadata, text) unchanged
vs the pre-engine direct path across 8 docs (4 layout families x EN/DE,
0 mismatch / 0 only_base / 0 only_engine); Golden degraded=0. BUILD_CP default is now
__file__-relative so the script self-locates control-pipeline/services.
This commit is contained in:
@@ -0,0 +1,92 @@
"""Capability Execution Engine (prototype).

Region IR with a runtime owner; capabilities declare claims/consumes/produces;
the execution order is DERIVED (topologically) from the artifact graph, not
hard-coded. Implements C6/C7 as pipeline stages with region ownership.
"""
|
||||
from dataclasses import dataclass, field
|
||||
import reading_order as RO
|
||||
|
||||
@dataclass
class Region:
    """A rectangular page area together with its ownership bookkeeping.

    ``bbox`` is unpacked elsewhere in this module as a 4-tuple
    ``(x0, top, x1, bottom)``. ``state`` starts as ``"unclaimed"`` and is set
    to ``"claimed"`` by ``run_pipeline`` once a capability takes the region;
    ``owner`` then holds that capability's ``name``.
    """

    id: int
    bbox: tuple
    type: str = "unknown"
    state: str = "unclaimed"
    # Stays None until a capability claims the region.
    owner: str = None
|
||||
|
||||
@dataclass
class Artifact:
    """A value produced by (or seeded for) a capability run.

    ``kind`` names the artifact type; kinds starting with ``"_"`` are treated
    as internal and filtered out of the result of ``run_pipeline``.
    ``payload`` is arbitrary data, and ``source_region`` is the id of the
    region the artifact came from, or None when it is not tied to one region.
    """

    kind: str
    payload: object
    source_region: int = None
|
||||
|
||||
class Capability:
    """Base class for pipeline stages.

    Subclasses declare which artifact kinds they need (``consumes``) and
    emit (``produces``); ``resolve_order`` schedules them from those lists.
    ``residual`` marks a capability that should claim regions only after all
    non-residual capabilities had their turn.
    """

    name = ""
    # NOTE: these lists live on the class and are shared by every subclass
    # that does not override them -- treat them as read-only.
    consumes = []
    produces = []
    residual = False

    def claims(self, region):
        """Return True if this capability wants to own *region* (default: never)."""
        return False

    def run(self, regions, page, artifacts):
        """Process the owned *regions* and return new artifacts (default: none)."""
        return []
|
||||
|
||||
class TableExtraction(Capability):
    """C6: claims regions of type ``"table"`` and reports their row counts."""

    name = "C6_TableExtraction"
    consumes = ["table_region"]
    produces = ["table_units"]

    def claims(self, region):
        """Return True for regions typed ``"table"``."""
        return region.type == "table"

    def run(self, regions, page, artifacts):
        """Emit one ``table_units`` artifact per owned region.

        Each region is cropped out of *page* (bbox clamped to the page
        bounds) and searched for tables again; only the first table found in
        the crop is extracted. The payload carries the region id and the
        number of extracted rows (0 when the crop yields no table).

        *page* is presumably a pdfplumber page (``crop`` / ``find_tables``)
        -- confirm against the caller.
        """
        units = []
        for region in regions:
            cropped = page.crop(_clamp(region.bbox, page))
            found = cropped.find_tables()
            if found:
                row_count = len(found[0].extract())
            else:
                row_count = 0
            payload = {"region": region.id, "rows": row_count}
            units.append(Artifact("table_units", payload, region.id))
        return units
|
||||
|
||||
class ReadingOrder(Capability):
    """C7: residual claimer for ``"prose"`` regions; reorders non-table words."""

    name = "C7_ReadingOrder"
    consumes = ["prose_region"]
    produces = ["ordered_prose"]
    residual = True

    def claims(self, region):
        """Return True for regions typed ``"prose"``."""
        return region.type == "prose"

    def run(self, regions, page, artifacts):
        """Emit a single ``ordered_prose`` artifact for the whole page.

        Words whose centre lies inside any ``_table_bbox`` artifact are
        dropped; the rest is handed to ``RO.emit_words`` together with the
        page width. If the reading-order module has no ``emit_words``, the
        text is taken to be empty. The payload only carries counts (number
        of words kept, length of the emitted text), not the text itself.
        The *regions* argument is not used.
        """
        excluded = [art.payload for art in artifacts if art.kind == "_table_bbox"]
        prose_words = []
        for word in page.extract_words():
            if _in_any(word, excluded):
                continue
            prose_words.append(word)
        if hasattr(RO, "emit_words"):
            text = RO.emit_words(prose_words, float(page.width))
        else:
            text = ""
        payload = {"words": len(prose_words), "chars": len(text)}
        return [Artifact("ordered_prose", payload, None)]
|
||||
|
||||
class FigureExtraction(Capability):
    """C8: claims regions of type ``"figure"``; ``run`` is inherited (no output)."""

    name = "C8_FigureExtraction"
    consumes = ["figure_region"]
    produces = ["figure_units"]

    def claims(self, region):
        """Return True for regions typed ``"figure"``."""
        return region.type == "figure"
|
||||
|
||||
class References(Capability):
    """C4: declared consumer of ``ordered_prose``; claims no regions.

    Only the contract is declared here -- ``claims`` and ``run`` are the
    inherited defaults, so no ``citation_units`` are actually emitted.
    """

    name = "C4_References"
    consumes = ["ordered_prose"]
    produces = ["citation_units"]
|
||||
|
||||
def _in_any(w, bboxes):
    """Return True if the centre of word *w* lies inside any box in *bboxes*.

    *w* is a mapping with ``x0``, ``x1``, ``top`` and ``bottom`` keys; each
    box is an ``(x0, top, x1, bottom)`` 4-tuple. Box edges are inclusive.
    """
    mid_x = (w["x0"] + w["x1"]) / 2
    mid_y = (w["top"] + w["bottom"]) / 2
    return any(
        left <= mid_x <= right and upper <= mid_y <= lower
        for (left, upper, right, lower) in bboxes
    )
|
||||
|
||||
def _clamp(b, page):
    """Clip the box *b* = ``(x0, top, x1, bottom)`` to the bounds of *page*.

    The left/top edges are raised to at least 0; the right/bottom edges are
    capped at ``page.width`` / ``page.height`` (converted to float).
    """
    x0, top, x1, bottom = b
    left = max(0, x0)
    upper = max(0, top)
    right = min(float(page.width), x1)
    lower = min(float(page.height), bottom)
    return (left, upper, right, lower)
|
||||
|
||||
def segment(page):
    """Split *page* into table regions plus one page-wide prose region.

    Every table reported by ``page.find_tables()`` is clamped to the page and
    kept only if it spans at least a quarter of the page width and is at
    least 25 units tall. Region ids are assigned consecutively from 0, tables
    first; the final region always covers the full page and is typed
    ``"prose"``.

    Returns:
        ``(regions, table_boxes)`` -- the Region list and the clamped boxes
        of the accepted tables, in the same order.
    """
    page_w = float(page.width)
    page_h = float(page.height)
    regions = []
    table_boxes = []
    for table in page.find_tables():
        box = _clamp(table.bbox, page)
        wide_enough = (box[2] - box[0]) >= 0.25 * page_w
        tall_enough = (box[3] - box[1]) >= 25
        # Only substantial tables count; small hits are footer artifacts.
        if not (wide_enough and tall_enough):
            continue
        regions.append(Region(len(regions), box, "table"))
        table_boxes.append(box)
    regions.append(Region(len(regions), (0, 0, page_w, page_h), "prose"))
    return regions, table_boxes
|
||||
|
||||
def resolve_order(caps, raw_types):
    """Derive an execution order for *caps* from their consume/produce lists.

    Starting from the artifact kinds in *raw_types*, the pending capabilities
    are scanned repeatedly in their given order. A capability is scheduled as
    soon as everything it consumes is available, and its products become
    available immediately -- also to capabilities later in the same scan.

    Returns:
        The capabilities in the order they were scheduled.

    Raises:
        RuntimeError: if a full scan schedules nothing, i.e. some capability
            consumes a kind that nothing available can provide.
    """
    known = set(raw_types)
    schedule = []
    pending = list(caps)
    while pending:
        deferred = []
        for cap in pending:
            if all(dep in known for dep in cap.consumes):
                schedule.append(cap)
                known.update(cap.produces)
            else:
                deferred.append(cap)
        if len(deferred) == len(pending):
            # Nothing could be scheduled in this scan: dependency is missing.
            raise RuntimeError("unsatisfiable: " + str([cap.name for cap in deferred]))
        pending = deferred
    return schedule
|
||||
|
||||
def run_pipeline(page, caps):
    """Segment *page*, assign regions to capabilities and run them in order.

    Steps:
      1. ``segment`` yields the regions and the accepted table boxes.
      2. The raw artifact kinds are ``"<type>_region"`` for every region type
         present; ``resolve_order`` derives the run order from them.
      3. Claim phase: non-residual capabilities pick unclaimed regions first,
         residual ones afterwards.
      4. Run phase: in derived order, a capability runs if it owns at least
         one region or consumes ``"ordered_prose"``.

    Returns:
        ``(regions, artifacts, order)`` where *artifacts* excludes internal
        kinds (those starting with ``"_"``).

    Raises:
        RuntimeError: propagated from ``resolve_order`` when a capability
            consumes a kind that is neither a region type present on this
            page nor produced by another capability.
    """
    regions, table_boxes = segment(page)
    raw_types = {region.type + "_region" for region in regions}
    order = resolve_order(caps, raw_types)

    # Seed the table geometry so C7 can exclude words inside tables.
    artifacts = [Artifact("_table_bbox", box) for box in table_boxes]

    # Claim phase: specific claimers first, residual ones last.
    claim_sequence = sorted(order, key=lambda cap: cap.residual)
    for cap in claim_sequence:
        for region in regions:
            if region.state != "unclaimed":
                continue
            if cap.claims(region):
                region.state = "claimed"
                region.owner = cap.name

    # Run phase, in the derived order.
    for cap in order:
        owned = [region for region in regions if region.owner == cap.name]
        if not owned and not any(dep == "ordered_prose" for dep in cap.consumes):
            continue
        artifacts.extend(cap.run(owned, page, artifacts))

    public = [art for art in artifacts if not art.kind.startswith("_")]
    return regions, public, order
|
||||
Reference in New Issue
Block a user