Fix: Remove broken getKlausurApiUrl and clean up empty lines
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
Some checks failed
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-school (push) Successful in 42s
CI / test-go-edu-search (push) Successful in 34s
CI / test-python-klausur (push) Failing after 2m51s
CI / test-python-agent-core (push) Successful in 21s
CI / test-nodejs-website (push) Successful in 29s
sed replacement left orphaned hostname references in story page and empty lines in getApiBase functions. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
441
klausur-service/backend/cv_layout_scoring.py
Normal file
441
klausur-service/backend/cv_layout_scoring.py
Normal file
@@ -0,0 +1,441 @@
|
||||
"""
|
||||
Language scoring, role scoring, and dictionary detection/classification.
|
||||
|
||||
Extracted from cv_layout.py to keep modules under 500 LOC.
|
||||
|
||||
Lizenz: Apache 2.0 (kommerziell nutzbar)
|
||||
DATENSCHUTZ: Alle Verarbeitung erfolgt lokal.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from cv_vocab_types import (
|
||||
ColumnGeometry,
|
||||
ENGLISH_FUNCTION_WORDS,
|
||||
GERMAN_FUNCTION_WORDS,
|
||||
PageRegion,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Dictionary / Wörterbuch Detection ---
|
||||
|
||||
# Tokens the dictionary detection treats as "article words": German
# articles (several inflected forms), the English articles, and the
# English infinitive marker "to".
_DICT_ARTICLE_WORDS = {
    # German
    "der", "die", "das", "des", "dem", "den",
    "ein", "eine", "einem", "einer",
    # English
    "a", "an", "the",
    "to",
}
|
||||
|
||||
|
||||
# --- Phase B: Content-Based Classification ---
|
||||
|
||||
def _score_language(words: List[Dict]) -> Dict[str, float]:
    """Score how English or German a column's words look.

    Three signals are combined:

    * the share of words found in ``ENGLISH_FUNCTION_WORDS`` /
      ``GERMAN_FUNCTION_WORDS`` (compared in lowercase),
    * the number of umlaut / eszett characters (boosts German),
    * the share of capitalized words longer than two characters
      (boosts German, only evaluated when more than five words qualify).

    Only words with ``conf`` above 40 and non-empty text are considered.
    Words whose text is empty or missing are skipped instead of raising.

    Args:
        words: List of word dicts with 'text' and 'conf' keys.

    Returns:
        Dict with 'eng' and 'deu' scores (0.0-1.0), rounded to 3 decimals.
    """
    if not words:
        return {'eng': 0.0, 'deu': 0.0}

    # A single filtered list feeds every signal. This keeps the denominator
    # identical for all ratios and guarantees that no empty string reaches
    # the ``text[0]`` capitalization check below.
    texts = [
        w['text']
        for w in words
        if w.get('conf', 0) > 40 and w.get('text')
    ]
    if not texts:
        return {'eng': 0.0, 'deu': 0.0}

    total = len(texts)
    lowered = [t.lower() for t in texts]
    en_score = sum(1 for t in lowered if t in ENGLISH_FUNCTION_WORDS) / total
    de_score = sum(1 for t in lowered if t in GERMAN_FUNCTION_WORDS) / total

    # Umlauts / eszett are a strong German signal; the boost is capped at
    # five characters (+0.75 at most).
    umlaut_count = sum(1 for t in texts for c in t if c in 'äöüÄÖÜß')
    if umlaut_count > 0:
        de_score = min(1.0, de_score + 0.15 * min(umlaut_count, 5))

    # German nouns are capitalized. Word position inside a sentence is not
    # known here, so every capitalized word longer than two characters
    # counts, including sentence-initial ones.
    if total > 5:
        cap_words = sum(1 for t in texts if len(t) > 2 and t[0].isupper())
        if cap_words / total > 0.3:
            de_score = min(1.0, de_score + 0.1)

    return {'eng': round(en_score, 3), 'deu': round(de_score, 3)}
|
||||
|
||||
|
||||
def _score_role(geom: ColumnGeometry) -> Dict[str, float]:
    """Rate how well a column fits each of four layout roles.

    The rating uses the column's relative width together with simple
    statistics over its words (only words with ``conf`` above 40 count):
    average word length, how many words contain punctuation, and the
    share of words containing a digit.

    Args:
        geom: ColumnGeometry; ``words``, ``width_ratio``, ``word_count``
            and ``index`` are read.

    Returns:
        Dict with the keys 'reference', 'marker', 'sentence' and
        'vocabulary'. All values are 0.0 when the column has no usable
        words; otherwise each value is rounded to 3 decimals.
    """
    role_scores = dict.fromkeys(
        ('reference', 'marker', 'sentence', 'vocabulary'), 0.0)

    if not geom.words:
        return role_scores

    tokens = [entry['text'] for entry in geom.words if entry.get('conf', 0) > 40]
    if not tokens:
        return role_scores

    n_tokens = len(tokens)
    mean_len = sum(map(len, tokens)) / n_tokens
    punctuated = sum(
        1 for tok in tokens if any(mark in tok for mark in '.!?;:,'))
    with_digits = sum(
        1 for tok in tokens if any(ch.isdigit() for ch in tok))
    digit_share = with_digits / n_tokens
    width = geom.width_ratio

    # Reference: narrow column, scored higher when many words carry digits.
    if width < 0.12:
        if digit_share > 0.4:
            role_scores['reference'] = min(1.0, 0.5 + digit_share * 0.5)
        else:
            role_scores['reference'] = 0.5

    # Marker: very narrow column with few entries; short entries score higher.
    if width < 0.06 and geom.word_count <= 15:
        role_scores['marker'] = 0.9 if mean_len < 4 else 0.7
    # An extremely narrow column that is not the first one is a strong
    # marker candidate no matter how many words it holds.
    if width < 0.04 and geom.index > 0:
        role_scores['marker'] = max(role_scores['marker'], 0.9)

    # Sentence: wide column with more than two punctuated words.
    if width > 0.15 and punctuated > 2:
        sentence = 0.3 + min(0.5, punctuated / n_tokens)
        if mean_len > 4:
            sentence = min(1.0, sentence + 0.2)
        role_scores['sentence'] = sentence

    # Vocabulary: medium width, boosted for medium average word length.
    if 0.10 < width < 0.45:
        vocabulary = 0.4
        if 3 < mean_len < 8:
            vocabulary = min(1.0, vocabulary + 0.3)
        role_scores['vocabulary'] = vocabulary

    return {role: round(value, 3) for role, value in role_scores.items()}
|
||||
|
||||
|
||||
def _dict_ordered_texts(geom: ColumnGeometry) -> List[str]:
    """Return a column's usable words ordered by their 'top' value.

    Words need ``conf`` above 30 and at least two characters after
    stripping; the returned texts are stripped and lowercased.
    """
    by_top = sorted(geom.words, key=lambda word: word.get("top", 0))
    return [
        word["text"].strip().lower()
        for word in by_top
        if word.get("conf", 0) > 30 and len(word["text"].strip()) >= 2
    ]


def _dict_alphabetical_signal(geometries: List[ColumnGeometry]) -> tuple:
    """Return ``(best_score, column_index)`` for alphabetical ordering.

    The score of a column is the fraction of consecutive word pairs that
    are in non-descending string order. ``column_index`` is -1 when no
    column scored above 0.
    """
    best_score, best_col = 0.0, -1
    for geom in geometries:
        texts = _dict_ordered_texts(geom)
        if len(texts) < 5:
            continue
        # Collapse runs of identical consecutive words (OCR double-reads).
        collapsed = [texts[0]] + [
            cur for prev, cur in zip(texts, texts[1:]) if cur != prev
        ]
        if len(collapsed) < 5:
            continue
        in_order = sum(
            1 for left, right in zip(collapsed, collapsed[1:]) if left <= right
        )
        score = in_order / (len(collapsed) - 1)
        if score > best_score:
            best_score, best_col = score, geom.index
    return best_score, best_col


def _dict_article_signal(geometries: List[ColumnGeometry]) -> tuple:
    """Return ``(density, density_col, inline_ratio, word_ratio)``.

    * ``density`` / ``density_col``: highest share of article words among
      columns with ``width_ratio`` <= 0.20, and that column's index (-1 if
      none scored above 0).
    * ``inline_ratio``: highest share of texts that start with an article
      followed by a space, over all columns.
    * ``word_ratio``: highest share of article words among columns holding
      at least 3 article words and at least 3 non-article words.
    """
    inline_prefixes = tuple(article + " " for article in _DICT_ARTICLE_WORDS)
    best_density, density_col = 0.0, -1
    best_inline = 0.0
    best_word_ratio = 0.0

    for geom in geometries:
        texts = [
            word["text"].strip().lower()
            for word in geom.words
            if word.get("conf", 0) > 30 and len(word["text"].strip()) > 0
        ]
        n_texts = len(texts)
        if n_texts < 3:
            continue
        n_articles = sum(1 for text in texts if text in _DICT_ARTICLE_WORDS)

        # (a) Dedicated article column: narrow, mostly article words.
        if geom.width_ratio <= 0.20:
            density = n_articles / n_texts
            if density > best_density:
                best_density, density_col = density, geom.index

        # (b) Inline articles such as "der Zustand" or "die Zutat".
        n_inline = sum(1 for text in texts if text.startswith(inline_prefixes))
        best_inline = max(best_inline, n_inline / n_texts)

        # (c) Articles split off as separate words inside a content column.
        # A column made up almost only of articles is left to (a).
        if n_articles >= 3 and n_texts - n_articles >= 3:
            best_word_ratio = max(best_word_ratio, n_articles / n_texts)

    return best_density, density_col, best_inline, best_word_ratio


def _dict_first_letter_signal(geometries: List[ColumnGeometry]) -> tuple:
    """Return ``(uniformity, column_index, has_letter_transition)``.

    Uniformity of a column is the share of its alphabetic first letters
    taken by the most frequent one. A column whose first letters form 2-5
    consecutive groups in non-descending order counts as an orderly letter
    transition, and its uniformity is raised to at least 0.70.
    """
    best_uniformity, best_col = 0.0, -1
    saw_transition = False
    for geom in geometries:
        texts = _dict_ordered_texts(geom)
        if len(texts) < 5:
            continue
        initials = [text[0] for text in texts if text[0].isalpha()]
        if not initials:
            continue
        top_count = Counter(initials).most_common(1)[0][1]
        uniformity = top_count / len(initials)

        # One entry per run of equal first letters, in reading order.
        runs = [initials[0]] + [
            cur for prev, cur in zip(initials, initials[1:]) if cur != prev
        ]
        if 2 <= len(runs) <= 5 and all(
            left <= right for left, right in zip(runs, runs[1:])
        ):
            saw_transition = True
            uniformity = max(uniformity, 0.70)

        if uniformity > best_uniformity:
            best_uniformity, best_col = uniformity, geom.index
    return best_uniformity, best_col, saw_transition


def _score_dictionary_signals(
    geometries: List[ColumnGeometry],
    document_category: Optional[str] = None,
    margin_strip_detected: bool = False,
) -> Dict[str, Any]:
    """Decide whether a page looks like a dictionary.

    Four signals are weighted and summed:

    1. Alphabetical ordering of words within a column (weight 0.35)
    2. Article detection: dedicated column, inline articles, or article
       word frequency (weight 0.25)
    3. First-letter uniformity (weight 0.25)
    4. Decorative A-Z margin strip, detected upstream (weight 0.15)

    A ``document_category`` of 'woerterbuch' adds 0.20 (capped at 1.0).
    The page is classified as a dictionary when the total reaches 0.40.

    Args:
        geometries: List of ColumnGeometry with words.
        document_category: User-selected category (e.g. 'woerterbuch').
        margin_strip_detected: Whether a decorative A-Z margin strip was found.

    Returns:
        Dict with 'is_dictionary', 'confidence', 'article_col_index',
        'headword_col_index', and a 'signals' sub-dict. With fewer than two
        columns the defaults (not a dictionary, empty signals) are returned.
    """
    signals: Dict[str, Any] = {}
    result: Dict[str, Any] = {
        "is_dictionary": False,
        "confidence": 0.0,
        "article_col_index": None,
        "headword_col_index": None,
        "signals": signals,
    }

    if len(geometries or ()) < 2:
        return result

    # --- Signal 1: alphabetical ordering ---
    alpha_score, alpha_col = _dict_alphabetical_signal(geometries)
    signals["alphabetical_score"] = round(alpha_score, 3)
    signals["alphabetical_col"] = alpha_col

    # --- Signal 2: articles; the strongest of the three patterns wins ---
    density, density_col, inline_ratio, word_ratio = _dict_article_signal(geometries)
    article_score = max(
        density,
        inline_ratio,
        word_ratio * 0.8,  # slight discount for raw word ratio
    )
    signals["article_density"] = round(density, 3)
    signals["inline_article_ratio"] = round(inline_ratio, 3)
    signals["article_word_ratio"] = round(word_ratio, 3)
    signals["article_col"] = density_col

    # --- Signal 3: first-letter uniformity ---
    uniformity, uniform_col, has_transition = _dict_first_letter_signal(geometries)
    signals["first_letter_uniformity"] = round(uniformity, 3)
    signals["uniform_col"] = uniform_col
    signals["has_letter_transition"] = has_transition

    # --- Signal 4: decorative margin strip ---
    signals["margin_strip_detected"] = margin_strip_detected

    # --- Weighted sum ---
    combined = (
        min(alpha_score, 1.0) * 0.35
        + min(article_score, 1.0) * 0.25
        + min(uniformity, 1.0) * 0.25
        + (1.0 if margin_strip_detected else 0.0) * 0.15
    )

    if document_category == "woerterbuch":
        combined = min(1.0, combined + 0.20)
        signals["category_boost"] = True

    result["confidence"] = round(combined, 3)

    if combined >= 0.40:
        result["is_dictionary"] = True
        # Headword column: prefer the alphabetical column, otherwise the
        # most uniform one.
        if alpha_col >= 0 and alpha_score >= 0.60:
            result["headword_col_index"] = alpha_col
        elif uniform_col >= 0 and uniformity >= 0.50:
            result["headword_col_index"] = uniform_col
        if density_col >= 0 and density >= 0.30:
            result["article_col_index"] = density_col
        # Strong inline articles without a dedicated column are flagged.
        if inline_ratio >= 0.30 and result["article_col_index"] is None:
            signals["inline_articles_detected"] = True

    logger.info(
        "DictionaryDetection: combined=%.3f is_dict=%s signals=%s",
        combined, result["is_dictionary"], result["signals"],
    )

    return result
|
||||
|
||||
|
||||
def _classify_dictionary_columns(
    geometries: List[ColumnGeometry],
    dict_signals: Dict[str, Any],
    lang_scores: List[Dict[str, float]],
    content_h: int,
) -> Optional[List[PageRegion]]:
    """Classify columns for a detected dictionary page.

    Columns are assigned in three steps:

    1. The column named by ``dict_signals['article_col_index']`` becomes
       ``column_article``.
    2. The column named by ``dict_signals['headword_col_index']`` becomes
       ``column_headword`` (unless it was already taken in step 1).
    3. Every other column becomes ``column_ipa`` when more than 25% of its
       words contain IPA-like characters, otherwise ``column_de`` /
       ``column_en`` according to the higher language score (which must
       exceed 0.05). Columns that match none of these get a positional
       fallback with confidence 0.4: the leftmost such column is
       ``column_en``, all others are ``column_de``.

    Args:
        geometries: Columns of the page.
        dict_signals: Result dict of the dictionary detection; read keys
            are 'is_dictionary', 'article_col_index', 'headword_col_index',
            'confidence' and 'signals'.
        lang_scores: Language scores looked up by ``geom.index``; a column
            without an entry is treated as ``{"eng": 0, "deu": 0}``.
        content_h: Height given to every returned region.

    Returns:
        Regions sorted by ``x``, or None when ``dict_signals`` does not
        mark the page as a dictionary.
    """
    if not dict_signals.get("is_dictionary"):
        return None

    def _region(geom: ColumnGeometry, col_type: str, confidence: float) -> PageRegion:
        # All regions share the column's box, the page content height and
        # the "dictionary" method tag.
        return PageRegion(
            type=col_type,
            x=geom.x, y=geom.y,
            width=geom.width, height=content_h,
            classification_confidence=confidence,
            classification_method="dictionary",
        )

    regions: List[PageRegion] = []
    assigned = set()
    article_idx = dict_signals.get("article_col_index")
    headword_idx = dict_signals.get("headword_col_index")

    # 1. Assign article column if detected
    if article_idx is not None:
        geom = next((g for g in geometries if g.index == article_idx), None)
        if geom is not None:
            regions.append(_region(
                geom, "column_article",
                round(dict_signals["signals"].get("article_density", 0.5), 2),
            ))
            assigned.add(geom.index)

    # 2. Assign headword column
    if headword_idx is not None and headword_idx not in assigned:
        geom = next((g for g in geometries if g.index == headword_idx), None)
        if geom is not None:
            regions.append(_region(
                geom, "column_headword",
                round(dict_signals["confidence"], 2),
            ))
            assigned.add(geom.index)

    # 3. Assign remaining columns by language + content
    remaining = [g for g in geometries if g.index not in assigned]

    # One entry per remaining column: (type, confidence), or None while the
    # column still needs the positional fallback.
    decisions: List[Any] = []
    for geom in remaining:
        # The lower bound keeps a negative index from wrapping around to
        # another column's scores.
        if 0 <= geom.index < len(lang_scores):
            ls = lang_scores[geom.index]
        else:
            ls = {"eng": 0, "deu": 0}

        # Share of words containing IPA-like characters (brackets, slash,
        # stress marks, phonetic vowels).
        ipa_words = sum(
            1 for w in geom.words
            if any(c in (w.get("text") or "") for c in "[]/ˈˌːɪəɒʊæɑɔ")
        )
        ipa_ratio = ipa_words / max(len(geom.words), 1)

        if ipa_ratio > 0.25:
            decisions.append(("column_ipa", round(min(1.0, ipa_ratio), 2)))
        elif ls["deu"] > ls["eng"] and ls["deu"] > 0.05:
            decisions.append(("column_de", round(ls["deu"], 2)))
        elif ls["eng"] > ls["deu"] and ls["eng"] > 0.05:
            decisions.append(("column_en", round(ls["eng"], 2)))
        else:
            decisions.append(None)

    # Positional fallback: leftmost undecided column = EN, the others = DE.
    # The ranking is done once, after all content-based decisions. Ranking
    # inside the loop made every column its own "leftmost unassigned" when
    # columns were visited left-to-right, so "column_de" was never chosen.
    undecided = sorted(
        (pos for pos, decision in enumerate(decisions) if decision is None),
        key=lambda pos: remaining[pos].x,
    )
    for rank, pos in enumerate(undecided):
        decisions[pos] = ("column_en" if rank == 0 else "column_de", 0.4)

    for geom, (col_type, conf) in zip(remaining, decisions):
        regions.append(_region(geom, col_type, conf))

    regions.sort(key=lambda r: r.x)
    return regions
|
||||
Reference in New Issue
Block a user