From a216a05d06483c1233baa6c908ac40d8787185a5 Mon Sep 17 00:00:00 2001
From: rohanprichard <rohanprichard@gmail.com>
Date: Fri, 3 Jul 2026 16:24:55 +0530
Subject: [PATCH] fix(content-detector): detect and compress space-separated
 JSON objects

Web search tools (SerpAPI, Tavily, custom backends) commonly return
back-to-back JSON objects separated by whitespace ({...} {...} {...})
rather than a JSON array. detect_content_type only treated input
starting with [ as JSON_ARRAY, so this shape fell through to PLAIN_TEXT
and SmartCrusher passed it through at 0% compression.

The detector now recognizes a run of >=2 whitespace-separated JSON
objects as JSON_ARRAY, and SmartCrusher normalizes that shape to a real
array before crushing. Measured ~34% byte reduction on a 100-result
web_search payload that previously compressed 0%.

Closes #1741
---
 CHANGELOG.md                               |  8 +++
 headroom/transforms/content_detector.py    | 69 +++++++++++++++++++---
 headroom/transforms/smart_crusher.py       |  8 +++
 tests/test_transforms_content_detection.py | 43 ++++++++++++++
 4 files changed, 120 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd36c2dfd..354b18ff3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   `min(4, cpu)` × 1). The ONNX embedder already capped its threads; this brings
   the torch path to parity
   ([#198](https://github.com/headroomlabs-ai/headroom/issues/198)).
+- Space-separated JSON objects (the shape many web-search tools emit —
+  `{...} {...} {...}` rather than a real array) are now detected as
+  `JSON_ARRAY` and compressed. `detect_content_type` only recognized inputs
+  starting with `[`, so this shape fell through to `PLAIN_TEXT` and SmartCrusher
+  passed it through at 0% compression; the detector now recognizes a run of two
+  or more whitespace-separated JSON objects and SmartCrusher normalizes them to
+  an array before crushing (~34% reduction on a 100-result web_search payload)
+  ([#1741](https://github.com/headroomlabs-ai/headroom/issues/1741)).
 
 ### Changed
 
diff --git a/headroom/transforms/content_detector.py b/headroom/transforms/content_detector.py
index 78ab52fb5..e678d1504 100644
--- a/headroom/transforms/content_detector.py
+++ b/headroom/transforms/content_detector.py
@@ -181,16 +181,57 @@ def detect_content_type(content: str) -> DetectionResult:
     return DetectionResult(ContentType.PLAIN_TEXT, 0.5, {})
 
 
+def _decode_concatenated_json(content: str) -> list | None:
+    """Decode a run of whitespace-separated top-level JSON values.
+
+    Web search tools (SerpAPI, Tavily, custom backends) commonly emit
+    back-to-back JSON objects separated only by whitespace rather than a real
+    array: ``{"title": ...} {"title": ...} {"title": ...}``. Returns the list
+    of decoded values, or None if the text isn't a clean run of JSON values
+    separated only by whitespace, or is nested too deeply to decode.
+    """
+    decoder = json.JSONDecoder()
+    idx, length = 0, len(content)
+    items: list = []
+    while idx < length:
+        while idx < length and content[idx].isspace():
+            idx += 1
+        if idx >= length:
+            break
+        try:
+            value, idx = decoder.raw_decode(content, idx)
+        except (ValueError, RecursionError):
+            return None
+        items.append(value)
+    return items or None
+
+
+def normalize_concatenated_json(content: str) -> str | None:
+    """Convert whitespace-separated JSON objects into a canonical JSON array.
+
+    SmartCrusher only compresses JSON arrays, so this rewrites the
+    space-separated web_search shape (``{...} {...} {...}``) into
+    ``[{...}, {...}, {...}]``. Returns None unless the content is two or more
+    whitespace-separated JSON objects. Non-ASCII text is kept unescaped.
+    """
+    stripped = content.strip()
+    if not stripped.startswith("{"):
+        return None
+    items = _decode_concatenated_json(stripped)
+    if items and len(items) >= 2 and all(isinstance(item, dict) for item in items):
+        return json.dumps(items, ensure_ascii=False)
+    return None
+
+
 def _try_detect_json(content: str) -> DetectionResult | None:
     """Try to detect JSON array content."""
     content = content.strip()
 
-    # Quick check: must start with [ for array
-    if not content.startswith("["):
-        return None
-
-    try:
-        parsed = json.loads(content)
+    if content.startswith("["):
+        try:
+            parsed = json.loads(content)
+        except json.JSONDecodeError:
+            return None
         if isinstance(parsed, list):
             # Check if it's a list of dicts (SmartCrusher compatible)
             if parsed and all(isinstance(item, dict) for item in parsed):
@@ -205,8 +246,20 @@ def _try_detect_json(content: str) -> DetectionResult | None:
                 0.8,
                 {"item_count": len(parsed), "is_dict_array": False},
             )
-    except json.JSONDecodeError:
-        pass
+        return None
+
+    # Space-separated JSON objects (typical web_search output) aren't a valid
+    # array, so they'd fall through to PLAIN_TEXT and skip SmartCrusher
+    # entirely. Treat a run of ≥2 whitespace-separated dict objects as a
+    # JSON_ARRAY; SmartCrusher normalizes it to a real array before crushing.
+    if content.startswith("{"):
+        items = _decode_concatenated_json(content)
+        if items and len(items) >= 2 and all(isinstance(item, dict) for item in items):
+            return DetectionResult(
+                ContentType.JSON_ARRAY,
+                1.0,
+                {"item_count": len(items), "is_dict_array": True, "concatenated": True},
+            )
 
     return None
 
diff --git a/headroom/transforms/smart_crusher.py b/headroom/transforms/smart_crusher.py
index bf8c1ed8f..249e37d34 100644
--- a/headroom/transforms/smart_crusher.py
+++ b/headroom/transforms/smart_crusher.py
@@ -54,6 +54,7 @@
 from ..tokenizer import Tokenizer
 from ..utils import compute_short_hash, create_tool_digest_marker, deep_copy_messages
 from .base import Transform
+from .content_detector import normalize_concatenated_json
 
 logger = logging.getLogger(__name__)
 
@@ -446,6 +447,13 @@ def crush(
         opaque-blob offload) leaves the content uncompacted instead.
         `None` (default) uses the instance's configured value.
         """
+        # Web search tools often return space-separated JSON objects
+        # (``{...} {...} {...}``) rather than a real array. The Rust crusher
+        # only compresses JSON arrays, so normalize that shape first —
+        # otherwise it passes through at 0% compression (#1741).
+        normalized = normalize_concatenated_json(content)
+        if normalized is not None:
+            content = normalized
         rust = (
             self._rust
             if lossless_only is None or bool(lossless_only) == self._lossless_only
diff --git a/tests/test_transforms_content_detection.py b/tests/test_transforms_content_detection.py
index d08a8d127..d209ecabc 100644
--- a/tests/test_transforms_content_detection.py
+++ b/tests/test_transforms_content_detection.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import json
+
 from headroom.transforms.content_detector import (
     ContentType,
     _try_detect_code,
@@ -10,6 +12,7 @@
     _try_detect_search,
     detect_content_type,
     is_json_array_of_dicts,
+    normalize_concatenated_json,
 )
 from headroom.transforms.error_detection import (
     ERROR_INDICATOR_KEYWORDS,
@@ -59,6 +62,46 @@ def test_json_detection_distinguishes_dict_arrays_and_other_lists() -> None:
     assert is_json_array_of_dicts('["value"]') is False
 
 
+def test_space_separated_json_objects_detected_as_array() -> None:
+    # Typical web_search output: back-to-back JSON objects, no array brackets.
+    content = " ".join(
+        json.dumps({"title": f"Result {i}", "url": f"http://example.com/{i}"}) for i in range(3)
+    )
+    result = _try_detect_json(content)
+    assert result is not None
+    assert result.content_type is ContentType.JSON_ARRAY
+    assert result.confidence == 1.0
+    assert result.metadata == {"item_count": 3, "is_dict_array": True, "concatenated": True}
+
+    # Reaches the same verdict through the top-level detector (not PLAIN_TEXT).
+    assert detect_content_type(content).content_type is ContentType.JSON_ARRAY
+    assert is_json_array_of_dicts(content) is True
+
+    # Newline-separated objects (the JSON Lines shape) must also be recognized.
+    newline_sep = "\n".join(json.dumps({"id": i, "snippet": "x"}) for i in range(2))
+    assert _try_detect_json(newline_sep).content_type is ContentType.JSON_ARRAY
+
+
+def test_space_separated_json_detection_is_conservative() -> None:
+    # A single object is not an array — must not be claimed.
+    assert _try_detect_json('{"id": 1}') is None
+    # Objects interleaved with prose are not clean concatenated JSON.
+    assert _try_detect_json('{"id": 1} then some prose {"id": 2}') is None
+    # A non-object value (here a trailing string) disqualifies the run of dicts.
+    assert _try_detect_json('{"id": 1} "loose string"') is None
+
+
+def test_normalize_concatenated_json_roundtrips_to_array() -> None:
+    content = '{"a": 1} {"b": 2}'
+    normalized = normalize_concatenated_json(content)
+    assert normalized is not None
+    assert json.loads(normalized) == [{"a": 1}, {"b": 2}]
+
+    # Already-valid arrays and single objects yield None; the caller keeps its input.
+    assert normalize_concatenated_json('[{"a": 1}]') is None
+    assert normalize_concatenated_json('{"a": 1}') is None
+
+
 def test_diff_detection_tracks_headers_and_changes() -> None:
     diff = "\n".join(
         [