Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
`min(4, cpu)` × 1). The ONNX embedder already capped its threads; this brings
the torch path to parity
([#198](https://github.com/headroomlabs-ai/headroom/issues/198)).
- Space-separated JSON objects (the shape many web-search tools emit —
`{...} {...} {...}` rather than a real array) are now detected as
`JSON_ARRAY` and compressed. `detect_content_type` only recognized inputs
starting with `[`, so this shape fell through to `PLAIN_TEXT` and SmartCrusher
passed it through at 0% compression; the detector now recognizes a run of two
or more whitespace-separated JSON objects and SmartCrusher normalizes them to
an array before crushing (~34% reduction on a 100-result web_search payload)
([#1741](https://github.com/headroomlabs-ai/headroom/issues/1741)).

### Changed

Expand Down
69 changes: 61 additions & 8 deletions headroom/transforms/content_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,16 +181,57 @@ def detect_content_type(content: str) -> DetectionResult:
return DetectionResult(ContentType.PLAIN_TEXT, 0.5, {})


def _decode_concatenated_json(content: str) -> list | None:
"""Decode a run of whitespace-separated top-level JSON values.

Web search tools (SerpAPI, Tavily, custom backends) commonly emit
back-to-back JSON objects separated only by whitespace rather than a real
array: ``{"title": ...} {"title": ...} {"title": ...}``. Returns the list
of decoded values, or None if the text isn't a clean run of JSON values
separated only by whitespace.
"""
decoder = json.JSONDecoder()
idx, length = 0, len(content)
items: list = []
while idx < length:
while idx < length and content[idx].isspace():
idx += 1
if idx >= length:
break
try:
value, idx = decoder.raw_decode(content, idx)
except ValueError:
return None
items.append(value)
return items or None


def normalize_concatenated_json(content: str) -> str | None:
    """Convert whitespace-separated JSON objects into a canonical JSON array.

    SmartCrusher only compresses JSON arrays, so this rewrites the
    space-separated web_search shape (``{...} {...} {...}``) into
    ``[{...}, {...}, {...}]``.

    Args:
        content: Raw text to inspect; surrounding whitespace is ignored.

    Returns:
        The serialized array, or None unless the content is two or more
        whitespace-separated JSON objects. Real arrays, a single object, and
        runs containing any non-object value all yield None so the caller
        keeps the original text.
    """
    stripped = content.strip()
    if not stripped.startswith("{"):
        return None
    items = _decode_concatenated_json(stripped)
    if not items or len(items) < 2:
        return None
    if not all(isinstance(item, dict) for item in items):
        return None

    # Keep non-ASCII text literal. The default ensure_ascii=True would turn
    # every such character into a six-character \uXXXX escape, inflating
    # non-English payloads right before they are meant to be compressed.
    normalized = json.dumps(items, ensure_ascii=False)
    try:
        normalized.encode("utf-8")
    except UnicodeEncodeError:
        # An escaped lone surrogate in the input decodes to a code point that
        # cannot be encoded as UTF-8; fall back to the all-ASCII form so the
        # result is always safe to hand to byte-oriented consumers.
        normalized = json.dumps(items)
    return normalized


def _try_detect_json(content: str) -> DetectionResult | None:
"""Try to detect JSON array content."""
content = content.strip()

# Quick check: must start with [ for array
if not content.startswith("["):
return None

try:
parsed = json.loads(content)
if content.startswith("["):
try:
parsed = json.loads(content)
except json.JSONDecodeError:
return None
if isinstance(parsed, list):
# Check if it's a list of dicts (SmartCrusher compatible)
if parsed and all(isinstance(item, dict) for item in parsed):
Expand All @@ -205,8 +246,20 @@ def _try_detect_json(content: str) -> DetectionResult | None:
0.8,
{"item_count": len(parsed), "is_dict_array": False},
)
except json.JSONDecodeError:
pass
return None

# Space-separated JSON objects (typical web_search output) aren't a valid
# array, so they'd fall through to PLAIN_TEXT and skip SmartCrusher
# entirely. Treat a run of ≥2 whitespace-separated dict objects as a
# JSON_ARRAY; SmartCrusher normalizes it to a real array before crushing.
if content.startswith("{"):
items = _decode_concatenated_json(content)
if items and len(items) >= 2 and all(isinstance(item, dict) for item in items):
return DetectionResult(
ContentType.JSON_ARRAY,
1.0,
{"item_count": len(items), "is_dict_array": True, "concatenated": True},
)

return None

Expand Down
8 changes: 8 additions & 0 deletions headroom/transforms/smart_crusher.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from ..tokenizer import Tokenizer
from ..utils import compute_short_hash, create_tool_digest_marker, deep_copy_messages
from .base import Transform
from .content_detector import normalize_concatenated_json

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -446,6 +447,13 @@ def crush(
opaque-blob offload) leaves the content uncompacted instead.
`None` (default) uses the instance's configured value.
"""
# Web search tools often return space-separated JSON objects
# (``{...} {...} {...}``) rather than a real array. The Rust crusher
# only compresses JSON arrays, so normalize that shape first —
# otherwise it passes through at 0% compression (#1741).
normalized = normalize_concatenated_json(content)
if normalized is not None:
content = normalized
rust = (
self._rust
if lossless_only is None or bool(lossless_only) == self._lossless_only
Expand Down
43 changes: 43 additions & 0 deletions tests/test_transforms_content_detection.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import json

from headroom.transforms.content_detector import (
ContentType,
_try_detect_code,
Expand All @@ -10,6 +12,7 @@
_try_detect_search,
detect_content_type,
is_json_array_of_dicts,
normalize_concatenated_json,
)
from headroom.transforms.error_detection import (
ERROR_INDICATOR_KEYWORDS,
Expand Down Expand Up @@ -59,6 +62,46 @@ def test_json_detection_distinguishes_dict_arrays_and_other_lists() -> None:
assert is_json_array_of_dicts('["value"]') is False


def test_space_separated_json_objects_detected_as_array() -> None:
    # Three search-style records joined by single spaces, with no enclosing
    # brackets: the shape web_search tools typically produce.
    serialized_records = [
        json.dumps({"title": f"Result {i}", "url": f"http://example.com/{i}"}) for i in range(3)
    ]
    content = " ".join(serialized_records)

    detection = _try_detect_json(content)
    assert detection is not None
    assert detection.content_type is ContentType.JSON_ARRAY
    assert detection.confidence == 1.0
    expected_metadata = {"item_count": 3, "is_dict_array": True, "concatenated": True}
    assert detection.metadata == expected_metadata

    # The public entry points must agree with the low-level detector rather
    # than falling back to PLAIN_TEXT.
    assert detect_content_type(content).content_type is ContentType.JSON_ARRAY
    assert is_json_array_of_dicts(content) is True

    # A newline between objects is an equally valid separator.
    newline_sep = "\n".join(json.dumps({"id": n, "snippet": "x"}) for n in range(2))
    assert _try_detect_json(newline_sep).content_type is ContentType.JSON_ARRAY


def test_space_separated_json_detection_is_conservative() -> None:
    rejected_inputs = (
        # A lone object is not an array and must not be claimed as one.
        '{"id": 1}',
        # Prose between objects means this is not clean concatenated JSON.
        '{"id": 1} then some prose {"id": 2}',
        # A scalar/string among the objects disqualifies the run of dicts.
        '{"id": 1} "loose string"',
    )
    for candidate in rejected_inputs:
        assert _try_detect_json(candidate) is None


def test_normalize_concatenated_json_roundtrips_to_array() -> None:
    array_text = normalize_concatenated_json('{"a": 1} {"b": 2}')
    assert array_text is not None
    assert json.loads(array_text) == [{"a": 1}, {"b": 2}]

    # Inputs that are already a valid array, or just one object, are reported
    # as "nothing to normalize" so the caller keeps them unchanged.
    for untouched in ('[{"a": 1}]', '{"a": 1}'):
        assert normalize_concatenated_json(untouched) is None


def test_diff_detection_tracks_headers_and_changes() -> None:
diff = "\n".join(
[
Expand Down
Loading