Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 46 additions & 3 deletions crates/headroom-core/src/transforms/search_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,36 @@ use md5::{Digest, Md5};

use crate::ccr::CcrStore;
use crate::signals::{ImportanceContext, LineImportanceDetector};

/// Reports whether `c` is a CJK ideograph, a kana character, or a Hangul
/// syllable.
///
/// The inclusive code-point ranges below are intentionally the same ones used
/// by the Python `_is_cjk_char`, so both search-compressor implementations
/// classify characters identically. Change them only in lockstep.
fn is_cjk_char(c: char) -> bool {
    // (first, last) code points, both inclusive.
    const CJK_RANGES: [(u32, u32); 5] = [
        (0x3040, 0x30FF),
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xAC00, 0xD7AF),
        (0xF900, 0xFAFF),
    ];

    let code_point = u32::from(c);
    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code_point))
}

/// Returns the set of two-character strings formed by every pair of adjacent
/// CJK characters in `text`.
///
/// Any non-CJK character ends the current run, so pairs never span it and an
/// isolated CJK character contributes nothing. This lets a CJK query written
/// without spaces still match content. Mirrors the Python `_cjk_bigrams`.
fn cjk_bigrams(text: &str) -> BTreeSet<String> {
    let mut bigrams = BTreeSet::new();
    // The preceding character, remembered only while inside a CJK run.
    let mut prev: Option<char> = None;

    for ch in text.chars() {
        if !is_cjk_char(ch) {
            // Run broken: the next CJK character starts a fresh run.
            prev = None;
            continue;
        }
        if let Some(left) = prev {
            bigrams.insert([left, ch].iter().collect::<String>());
        }
        prev = Some(ch);
    }

    bigrams
}
use crate::transforms::adaptive_sizer::compute_optimal_k;

// ─── Types ──────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -363,18 +393,23 @@ impl SearchCompressor {

pub fn score_matches(&self, files: &mut BTreeMap<String, FileMatches>, context: &str) {
let context_lower = context.to_ascii_lowercase();
let context_words: Vec<&str> = context_lower
// Dedup like Python's `set`; count length in CHARS (not bytes) to match
// Python codepoints; and add CJK char bigrams so a spaceless CJK query
// (no whitespace words to split on) can still match content.
let mut context_words: BTreeSet<String> = context_lower
.split_whitespace()
.filter(|w| w.len() > 2)
.filter(|w| w.chars().count() > 2)
.map(|w| w.to_string())
.collect();
context_words.extend(cjk_bigrams(&context_lower));

for fm in files.values_mut() {
for m in &mut fm.matches {
let mut score: f32 = 0.0;
let content_lower = m.content.to_ascii_lowercase();

for w in &context_words {
if content_lower.contains(w) {
if content_lower.contains(w.as_str()) {
score += 0.3;
}
}
Expand Down Expand Up @@ -682,6 +717,14 @@ mod tests {
);
}

#[test]
fn cjk_bigrams_from_runs() {
    // A four-character CJK run yields exactly its three overlapping pairs.
    // Each condition is asserted separately so a failure names the culprit.
    let b = cjk_bigrams("认证令牌");
    assert_eq!(b.len(), 3, "unexpected bigram set: {:?}", b);
    for pair in ["认证", "证令", "令牌"] {
        assert!(b.contains(pair), "missing bigram {:?} in {:?}", pair, b);
    }

    // Pure ASCII input contains no CJK characters, hence no bigrams.
    assert!(cjk_bigrams("hello").is_empty());

    // Isolated CJK chars separated by non-CJK chars -> no adjacent pair.
    assert!(cjk_bigrams("a认b证").is_empty());
}

#[test]
fn parses_ripgrep_context_line() {
assert_eq!(
Expand Down
43 changes: 40 additions & 3 deletions headroom/transforms/search_compressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,36 @@
logger = logging.getLogger(__name__)


def _is_cjk_char(c: str) -> bool:
"""True for CJK ideographs, kana, and Hangul. Code-point ranges kept
byte-identical with the Rust `is_cjk_char` for search-compressor parity."""
o = ord(c)
return (
0x3040 <= o <= 0x30FF
or 0x3400 <= o <= 0x4DBF
or 0x4E00 <= o <= 0x9FFF
or 0xAC00 <= o <= 0xD7AF
or 0xF900 <= o <= 0xFAFF
)


def _cjk_bigrams(text: str) -> set[str]:
    """Return the set of two-character strings formed by every pair of
    adjacent CJK characters in ``text``.

    Any non-CJK character ends the current run, so pairs never span it and an
    isolated CJK character contributes nothing. This lets a CJK query written
    without spaces still match content. Mirrors the Rust `cjk_bigrams`.
    """
    bigrams: set[str] = set()
    # The preceding character, remembered only while inside a CJK run.
    previous = None
    for ch in text:
        if not _is_cjk_char(ch):
            # Run broken: the next CJK character starts a fresh run.
            previous = None
            continue
        if previous is not None:
            bigrams.add(previous + ch)
        previous = ch
    return bigrams


# ─── Public dataclasses (preserve existing import surface) ──────────────────


Expand Down Expand Up @@ -223,14 +253,21 @@ def _score_matches(

Stays Python so the legacy direct-call test surface keeps
working without rebuilding through Rust on every test. The
scoring constants must mirror Rust `SearchCompressor::score_matches`
— Rust unit tests pin Rust's behavior; the parity assertion at
the bottom of this module pins both sides agree.
scoring constants mirror Rust `SearchCompressor::score_matches`,
pinned by Rust unit tests and Python tests over the same inputs:
word-overlap and CJK-bigram scoring are byte-equal. (The error-
boost keyword set still diverges for a few terms fixed only on
the Rust side -- see keyword_detector; there is no cross-impl
assertion, so this equality is test-pinned, not mechanically
enforced.)
"""
from headroom.transforms.error_detection import PRIORITY_PATTERNS_SEARCH

context_lower = context.lower()
# Dedup whitespace words (len>2 by codepoints), and add CJK char bigrams
# so a spaceless CJK query can match content.
context_words = {w for w in context_lower.split() if len(w) > 2}
context_words |= _cjk_bigrams(context_lower)

for fm in file_matches.values():
for match in fm.matches:
Expand Down
36 changes: 36 additions & 0 deletions tests/test_search_compressor_cjk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""CJK-aware relevance scoring in the search compressor.

The relevance scorer tokenized the query on whitespace, so a spaceless CJK query
matched content only when the WHOLE query was a literal substring of a line. CJK
char bigrams now let a longer CJK query boost lines that share a substring. The
Rust<->Python parity was also hardened (dedup like Python's set; char-length
filter instead of bytes). These exercise the Python legacy scorer that mirrors
Rust.
"""

from headroom.transforms.search_compressor import (
SearchCompressor,
SearchCompressorConfig,
_cjk_bigrams,
)


def test_cjk_bigrams_from_runs():
    """`_cjk_bigrams` pairs adjacent CJK characters only within a CJK run."""
    expected_by_text = {
        # one four-character run -> its three overlapping pairs
        "认证令牌": {"认证", "证令", "令牌"},
        # ASCII -> no CJK bigrams
        "hello world": set(),
        # isolated CJK chars -> no adjacent pair
        "a认b证": set(),
    }
    for text, expected in expected_by_text.items():
        assert _cjk_bigrams(text) == expected


def test_score_matches_cjk_query_bigrams_boost():
    """A spaceless CJK query scores a line that shares only some of its bigrams.

    The compressor is configured with ``boost_errors=False`` and no context
    keywords -- presumably so that only query overlap contributes to the
    score (confirm against `SearchCompressorConfig`).
    """
    config = SearchCompressorConfig(boost_errors=False, context_keywords=[])
    compressor = SearchCompressor(config)

    result_lines = [
        "src/a.py:10:认证令牌已过期需要重新登录",
        "src/b.py:2:plain ascii content here",
    ]
    parsed = compressor._parse_search_results("\n".join(result_lines))

    # The whole query is NOT a substring of the content line, but its leading
    # bigrams are.
    query = "认证令牌缓存淘汰策略"
    compressor._score_matches(parsed, query)

    cjk_match = parsed["src/a.py"].matches[0]
    assert cjk_match.score > 0  # 认证/证令/令牌 bigrams match
    ascii_match = parsed["src/b.py"].matches[0]
    assert ascii_match.score == 0
Loading