headroomlabs-ai · lifeodyssey · Jun 30, 2026
@@ -70,6 +70,36 @@ use md5::{Digest, Md5};
 
 use crate::ccr::CcrStore;
 use crate::signals::{ImportanceContext, LineImportanceDetector};
+
+/// Reports whether `c` is a kana, CJK ideograph, or Hangul syllable.
+///
+/// The accepted code-point ranges are listed in `CJK_RANGES` below. The
+/// Python `_is_cjk_char` uses the same ranges; any change here must be made
+/// there as well so both search compressors score queries identically.
+fn is_cjk_char(c: char) -> bool {
+    /// Inclusive `(low, high)` code-point bounds.
+    const CJK_RANGES: [(u32, u32); 5] = [
+        (0x3040, 0x30FF), // Hiragana + Katakana
+        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
+        (0x4E00, 0x9FFF), // CJK Unified Ideographs
+        (0xAC00, 0xD7AF), // Hangul Syllables
+        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
+    ];
+    let code = u32::from(c);
+    CJK_RANGES
+        .iter()
+        .any(|&(low, high)| (low..=high).contains(&code))
+}
+
+/// Collects every pair of adjacent CJK characters found in `text`.
+///
+/// A spaceless CJK query has no whitespace-delimited words, so these
+/// two-character substrings act as its match terms. Pairs never span a
+/// non-CJK character, and duplicates collapse because the result is a set.
+/// Callers pass an already-lowercased query. Mirrors the Python
+/// `_cjk_bigrams`.
+fn cjk_bigrams(text: &str) -> BTreeSet<String> {
+    // Pair each character with its successor; a pair belongs to a CJK run
+    // exactly when both of its members are CJK characters.
+    let successors = text.chars().skip(1);
+    text.chars()
+        .zip(successors)
+        .filter(|&(left, right)| is_cjk_char(left) && is_cjk_char(right))
+        .map(|(left, right)| [left, right].iter().collect::<String>())
+        .collect()
+}
 use crate::transforms::adaptive_sizer::compute_optimal_k;
 
 // ─── Types ──────────────────────────────────────────────────────────────
@@ -363,18 +393,23 @@ impl SearchCompressor {
 
     pub fn score_matches(&self, files: &mut BTreeMap<String, FileMatches>, context: &str) {
         let context_lower = context.to_ascii_lowercase();
-        let context_words: Vec<&str> = context_lower
+        // Dedup like Python's `set`; count length in CHARS (not bytes) to match
+        // Python codepoints; and add CJK char bigrams so a spaceless CJK query
+        // (no whitespace words to split on) can still match content.
+        let mut context_words: BTreeSet<String> = context_lower
             .split_whitespace()
-            .filter(|w| w.len() > 2)
+            .filter(|w| w.chars().count() > 2)
+            .map(|w| w.to_string())
             .collect();
+        context_words.extend(cjk_bigrams(&context_lower));
 
         for fm in files.values_mut() {
             for m in &mut fm.matches {
                 let mut score: f32 = 0.0;
                 let content_lower = m.content.to_ascii_lowercase();
 
                 for w in &context_words {
-                    if content_lower.contains(w) {
+                    if content_lower.contains(w.as_str()) {
                         score += 0.3;
                     }
                 }
@@ -682,6 +717,14 @@ mod tests {
         );
     }
 
+    #[test]
+    fn cjk_bigrams_from_runs() {
+        // A four-character run yields exactly its three overlapping pairs.
+        // The returned set iterates in sorted order, so one exact comparison
+        // covers both membership and size.
+        let found = cjk_bigrams("认证令牌");
+        let pairs: Vec<&str> = found.iter().map(String::as_str).collect();
+        assert_eq!(pairs, ["令牌", "认证", "证令"]);
+
+        // Text without CJK characters produces nothing.
+        assert!(cjk_bigrams("hello").is_empty());
+        // CJK characters separated by other characters never form a pair.
+        assert!(cjk_bigrams("a认b证").is_empty());
+    }
+
     #[test]
     fn parses_ripgrep_context_line() {
         assert_eq!(

@@ -50,6 +50,36 @@
 logger = logging.getLogger(__name__)
 
 
+def _is_cjk_char(c: str) -> bool:
+    """True for CJK ideographs, kana, and Hangul. Code-point ranges kept
+    byte-identical with the Rust `is_cjk_char` for search-compressor parity."""
+    o = ord(c)
+    return (
+        0x3040 <= o <= 0x30FF
+        or 0x3400 <= o <= 0x4DBF
+        or 0x4E00 <= o <= 0x9FFF
+        or 0xAC00 <= o <= 0xD7AF
+        or 0xF900 <= o <= 0xFAFF
+    )
+
+
+def _cjk_bigrams(text: str) -> set[str]:
+    """CJK character bigrams from the CJK runs of a (lowercased) query, so a
+    spaceless CJK query can match content. Mirrors the Rust `cjk_bigrams`."""
+    out: set[str] = set()
+    run: list[str] = []
+    for c in text:
+        if _is_cjk_char(c):
+            run.append(c)
+        else:
+            for i in range(len(run) - 1):
+                out.add(run[i] + run[i + 1])
+            run = []
+    for i in range(len(run) - 1):
+        out.add(run[i] + run[i + 1])
+    return out
+
+
 # ─── Public dataclasses (preserve existing import surface) ──────────────────
 
 
@@ -223,14 +253,21 @@ def _score_matches(
 
         Stays Python so the legacy direct-call test surface keeps
         working without rebuilding through Rust on every test. The
-        scoring constants must mirror Rust `SearchCompressor::score_matches`
-        — Rust unit tests pin Rust's behavior; the parity assertion at
-        the bottom of this module pins both sides agree.
+        scoring constants mirror Rust `SearchCompressor::score_matches`,
+        pinned by Rust unit tests and Python tests over the same inputs:
+        word-overlap and CJK-bigram scoring are byte-equal. (The error-
+        boost keyword set still diverges for a few terms fixed only on
+        the Rust side -- see keyword_detector; there is no cross-impl
+        assertion, so this equality is test-pinned, not mechanically
+        enforced.)
         """
         from headroom.transforms.error_detection import PRIORITY_PATTERNS_SEARCH
 
         context_lower = context.lower()
+        # Dedup whitespace words (len>2 by codepoints), and add CJK char bigrams
+        # so a spaceless CJK query can match content.
         context_words = {w for w in context_lower.split() if len(w) > 2}
+        context_words |= _cjk_bigrams(context_lower)
 
         for fm in file_matches.values():
             for match in fm.matches:

@@ -0,0 +1,36 @@
+"""CJK-aware relevance scoring in the search compressor.
+
+The relevance scorer tokenized the query on whitespace, so a spaceless CJK query
+matched content only when the WHOLE query was a literal substring of a line. CJK
+char bigrams now let a longer CJK query boost lines that share a substring. The
+Rust<->Python parity was also hardened (dedup like Python's set; char-length
+filter instead of bytes). These exercise the Python legacy scorer that mirrors
+Rust.
+"""
+
+from headroom.transforms.search_compressor import (
+    SearchCompressor,
+    SearchCompressorConfig,
+    _cjk_bigrams,
+)
+
+
+def test_cjk_bigrams_from_runs():
+    assert _cjk_bigrams("认证令牌") == {"认证", "证令", "令牌"}
+    assert _cjk_bigrams("hello world") == set()  # ASCII -> no CJK bigrams
+    assert _cjk_bigrams("a认b证") == set()  # isolated CJK chars -> no adjacent pair
+
+
+def test_score_matches_cjk_query_bigrams_boost():
+    compressor = SearchCompressor(SearchCompressorConfig(boost_errors=False, context_keywords=[]))
+    content = "\n".join(
+        [
+            "src/a.py:10:认证令牌已过期需要重新登录",
+            "src/b.py:2:plain ascii content here",
+        ]
+    )
+    parsed = compressor._parse_search_results(content)
+    # the whole query is NOT a substring of the content line, but its bigrams are
+    compressor._score_matches(parsed, "认证令牌缓存淘汰策略")
+    assert parsed["src/a.py"].matches[0].score > 0  # 认证/证令/令牌 bigrams match
+    assert parsed["src/b.py"].matches[0].score == 0