From a216a05d06483c1233baa6c908ac40d8787185a5 Mon Sep 17 00:00:00 2001 From: rohanprichard Date: Fri, 3 Jul 2026 16:24:55 +0530 Subject: [PATCH] fix(content-detector): detect and compress space-separated JSON objects Web search tools (SerpAPI, Tavily, custom backends) commonly return back-to-back JSON objects separated by whitespace ({...} {...} {...}) rather than a JSON array. detect_content_type only treated input starting with [ as JSON_ARRAY, so this shape fell through to PLAIN_TEXT and SmartCrusher passed it through at 0% compression. The detector now recognizes a run of >=2 whitespace-separated JSON objects as JSON_ARRAY, and SmartCrusher normalizes that shape to a real array before crushing. Measured ~34% byte reduction on a 100-result web_search payload that previously compressed 0%. Closes #1741 --- CHANGELOG.md | 8 +++ headroom/transforms/content_detector.py | 69 +++++++++++++++++++--- headroom/transforms/smart_crusher.py | 8 +++ tests/test_transforms_content_detection.py | 43 ++++++++++++++ 4 files changed, 120 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd36c2dfd..354b18ff3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `min(4, cpu)` × 1). The ONNX embedder already capped its threads; this brings the torch path to parity ([#198](https://github.com/headroomlabs-ai/headroom/issues/198)). +- Space-separated JSON objects (the shape many web-search tools emit — + `{...} {...} {...}` rather than a real array) are now detected as + `JSON_ARRAY` and compressed. `detect_content_type` only recognized inputs + starting with `[`, so this shape fell through to `PLAIN_TEXT` and SmartCrusher + passed it through at 0% compression; the detector now recognizes a run of two + or more whitespace-separated JSON objects and SmartCrusher normalizes them to + an array before crushing (~34% reduction on a 100-result web_search payload) + ([#1741](https://github.com/headroomlabs-ai/headroom/issues/1741)). ### Changed diff --git a/headroom/transforms/content_detector.py b/headroom/transforms/content_detector.py index 78ab52fb5..e678d1504 100644 --- a/headroom/transforms/content_detector.py +++ b/headroom/transforms/content_detector.py @@ -181,16 +181,57 @@ def detect_content_type(content: str) -> DetectionResult: return DetectionResult(ContentType.PLAIN_TEXT, 0.5, {}) +def _decode_concatenated_json(content: str) -> list | None: + """Decode a run of whitespace-separated top-level JSON values. + + Web search tools (SerpAPI, Tavily, custom backends) commonly emit + back-to-back JSON objects separated only by whitespace rather than a real + array: ``{"title": ...} {"title": ...} {"title": ...}``. Returns the list + of decoded values, or None if the text isn't a clean run of JSON values + separated only by whitespace. + """ + decoder = json.JSONDecoder() + idx, length = 0, len(content) + items: list = [] + while idx < length: + while idx < length and content[idx].isspace(): + idx += 1 + if idx >= length: + break + try: + value, idx = decoder.raw_decode(content, idx) + except ValueError: + return None + items.append(value) + return items or None + + +def normalize_concatenated_json(content: str) -> str | None: + """Convert whitespace-separated JSON objects into a canonical JSON array. + + SmartCrusher only compresses JSON arrays, so this rewrites the + space-separated web_search shape (``{...} {...} {...}``) into + ``[{...}, {...}, {...}]``. Returns None unless the content is two or more + whitespace-separated JSON objects. + """ + stripped = content.strip() + if not stripped.startswith("{"): + return None + items = _decode_concatenated_json(stripped) + if items and len(items) >= 2 and all(isinstance(item, dict) for item in items): + return json.dumps(items) + return None + + def _try_detect_json(content: str) -> DetectionResult | None: """Try to detect JSON array content.""" content = content.strip() - # Quick check: must start with [ for array - if not content.startswith("["): - return None - - try: - parsed = json.loads(content) + if content.startswith("["): + try: + parsed = json.loads(content) + except json.JSONDecodeError: + return None if isinstance(parsed, list): # Check if it's a list of dicts (SmartCrusher compatible) if parsed and all(isinstance(item, dict) for item in parsed): @@ -205,8 +246,20 @@ def _try_detect_json(content: str) -> DetectionResult | None: 0.8, {"item_count": len(parsed), "is_dict_array": False}, ) - except json.JSONDecodeError: - pass + return None + + # Space-separated JSON objects (typical web_search output) aren't a valid + # array, so they'd fall through to PLAIN_TEXT and skip SmartCrusher + # entirely. Treat a run of ≥2 whitespace-separated dict objects as a + # JSON_ARRAY; SmartCrusher normalizes it to a real array before crushing. + if content.startswith("{"): + items = _decode_concatenated_json(content) + if items and len(items) >= 2 and all(isinstance(item, dict) for item in items): + return DetectionResult( + ContentType.JSON_ARRAY, + 1.0, + {"item_count": len(items), "is_dict_array": True, "concatenated": True}, + ) return None diff --git a/headroom/transforms/smart_crusher.py b/headroom/transforms/smart_crusher.py index bf8c1ed8f..249e37d34 100644 --- a/headroom/transforms/smart_crusher.py +++ b/headroom/transforms/smart_crusher.py @@ -54,6 +54,7 @@ from ..tokenizer import Tokenizer from ..utils import compute_short_hash, create_tool_digest_marker, deep_copy_messages from .base import Transform +from .content_detector import normalize_concatenated_json logger = logging.getLogger(__name__) @@ -446,6 +447,13 @@ def crush( opaque-blob offload) leaves the content uncompacted instead. `None` (default) uses the instance's configured value. """ + # Web search tools often return space-separated JSON objects + # (``{...} {...} {...}``) rather than a real array. The Rust crusher + # only compresses JSON arrays, so normalize that shape first — + # otherwise it passes through at 0% compression (#1741). + normalized = normalize_concatenated_json(content) + if normalized is not None: + content = normalized rust = ( self._rust if lossless_only is None or bool(lossless_only) == self._lossless_only diff --git a/tests/test_transforms_content_detection.py b/tests/test_transforms_content_detection.py index d08a8d127..d209ecabc 100644 --- a/tests/test_transforms_content_detection.py +++ b/tests/test_transforms_content_detection.py @@ -1,5 +1,7 @@ from __future__ import annotations +import json + from headroom.transforms.content_detector import ( ContentType, _try_detect_code, @@ -10,6 +12,7 @@ _try_detect_search, detect_content_type, is_json_array_of_dicts, + normalize_concatenated_json, ) from headroom.transforms.error_detection import ( ERROR_INDICATOR_KEYWORDS, @@ -59,6 +62,46 @@ def test_json_detection_distinguishes_dict_arrays_and_other_lists() -> None: assert is_json_array_of_dicts('["value"]') is False +def test_space_separated_json_objects_detected_as_array() -> None: + # Typical web_search output: back-to-back JSON objects, no array brackets. + content = " ".join( + json.dumps({"title": f"Result {i}", "url": f"http://example.com/{i}"}) for i in range(3) + ) + result = _try_detect_json(content) + assert result is not None + assert result.content_type is ContentType.JSON_ARRAY + assert result.confidence == 1.0 + assert result.metadata == {"item_count": 3, "is_dict_array": True, "concatenated": True} + + # Reaches the same verdict through the top-level detector (not PLAIN_TEXT). + assert detect_content_type(content).content_type is ContentType.JSON_ARRAY + assert is_json_array_of_dicts(content) is True + + # Newline separation is just as common and must also be recognized. + newline_sep = "\n".join(json.dumps({"id": i, "snippet": "x"}) for i in range(2)) + assert _try_detect_json(newline_sep).content_type is ContentType.JSON_ARRAY + + +def test_space_separated_json_detection_is_conservative() -> None: + # A single object is not an array — must not be claimed. + assert _try_detect_json('{"id": 1}') is None + # Objects interleaved with prose are not clean concatenated JSON. + assert _try_detect_json('{"id": 1} then some prose {"id": 2}') is None + # Scalars/strings between objects disqualify the run of dicts. + assert _try_detect_json('{"id": 1} "loose string"') is None + + +def test_normalize_concatenated_json_roundtrips_to_array() -> None: + content = '{"a": 1} {"b": 2}' + normalized = normalize_concatenated_json(content) + assert normalized is not None + assert json.loads(normalized) == [{"a": 1}, {"b": 2}] + + # Already-valid arrays and single objects are left for the caller as-is. + assert normalize_concatenated_json('[{"a": 1}]') is None + assert normalize_concatenated_json('{"a": 1}') is None + + def test_diff_detection_tracks_headers_and_changes() -> None: diff = "\n".join( [