From eb2093bf8f98fe516044b2a23102eb20e3c69d98 Mon Sep 17 00:00:00 2001
From: Parideboy <Boyhot96ita@hotmail.it>
Date: Thu, 2 Jul 2026 18:34:38 +0200
Subject: [PATCH] fix(core): load ONNX Runtime dynamically so headroom._core
 imports on non-AVX2 x86-64

fastembed's ort-download-binaries-rustls-tls feature statically links
Microsoft's prebuilt ONNX Runtime into the extension on non-Windows
targets. That prebuilt x86_64 binary requires AVX2 and its code runs as
soon as the module loads, so `import headroom._core` died with SIGILL on
pre-AVX2 CPUs before the runtime AVX2 guard from #1162 could intervene.

Build with ort-load-dynamic on every platform (Windows already did, for
DirectML link-lib reasons), collapsing the two identical target blocks
into one dependency. ORT is now only dlopen'd at first use, where the
AVX2 guard falls back to the non-ONNX detection tiers.

To keep Magika/fastembed working out of the box, extend the Windows-only
ORT_DYLIB_PATH auto-pin in headroom/_ort.py to all platforms: it now
resolves the pip onnxruntime package's shared library (.dll/.so/.dylib),
whose CPU wheels use runtime dispatch and run on pre-AVX2 machines too.

Fixes #1278

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 Cargo.lock                              | 18 -------
 crates/headroom-core/Cargo.toml         | 21 ++++----
 docs/content/docs/configuration.mdx     |  2 +-
 headroom/_ort.py                        | 65 ++++++++++++++++---------
 tests/test_transforms/test_ort_dylib.py | 41 ++++++++++++----
 5 files changed, 85 insertions(+), 62 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2123f6ea5..84cf9da4c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2021,12 +2021,6 @@ dependencies = [
  "digest 0.11.3",
 ]
 
-[[package]]
-name = "hmac-sha256"
-version = "1.1.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec9d92d097f4749b64e8cc33d924d9f40a2d4eb91402b458014b781f5733d60f"
-
 [[package]]
 name = "http"
 version = "0.2.12"
@@ -2583,12 +2577,6 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
 
-[[package]]
-name = "lzma-rust2"
-version = "0.15.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e20f57f9918e5bd7bc58c22cdd70a6afc7375d4dd9683af5f2b34bd3d2bba619"
-
 [[package]]
 name = "macro_rules_attribute"
 version = "0.2.2"
@@ -2941,7 +2929,6 @@ dependencies = [
  "ort-sys",
  "smallvec",
  "tracing",
- "ureq 3.3.0",
 ]
 
 [[package]]
@@ -2949,11 +2936,6 @@ name = "ort-sys"
 version = "2.0.0-rc.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90"
-dependencies = [
- "hmac-sha256",
- "lzma-rust2",
- "ureq 3.3.0",
-]
 
 [[package]]
 name = "outref"
diff --git a/crates/headroom-core/Cargo.toml b/crates/headroom-core/Cargo.toml
index a5705ba8a..fec94392c 100644
--- a/crates/headroom-core/Cargo.toml
+++ b/crates/headroom-core/Cargo.toml
@@ -122,17 +122,16 @@ redis = { version = "0.27", optional = true, default-features = false }
 # cycling through the proxy crate. Tiny crate (no I/O, just types).
 http = "1"
 
-[target.'cfg(not(windows))'.dependencies]
-fastembed = { version = "5", default-features = false, features = [
-    "hf-hub-rustls-tls",
-    "ort-download-binaries-rustls-tls",
-    "image-models",
-] }
-
-[target.'cfg(windows)'.dependencies]
-# `ort-download-binaries-*` emits DirectML link libs on Windows (`DXCORE`,
-# `DXGI`, `D3D12`, `DirectML`). Users installing `headroom-ai[all]` from
-# sdist often do not have those SDK libs, so load ORT dynamically instead.
+# Load ONNX Runtime dynamically on every platform. The alternative,
+# `ort-download-binaries-*`, statically links Microsoft's prebuilt ORT:
+# on Windows it emits DirectML link libs (`DXCORE`, `DXGI`, `D3D12`,
+# `DirectML`) that sdist installs of `headroom-ai[all]` often lack, and
+# on x86_64 Linux/macOS the prebuilt binary requires AVX2 — its code is
+# mapped and initialized as soon as the `headroom._core` extension
+# loads, so importing headroom SIGILLed on pre-AVX2 CPUs before the
+# runtime AVX2 guard could run (#1278). With `ort-load-dynamic` the
+# library is only dlopen'd at first use, where the AVX2 guard falls
+# back to the non-ONNX detection tiers.
 fastembed = { version = "5", default-features = false, features = [
     "hf-hub-rustls-tls",
     "ort-load-dynamic",
diff --git a/docs/content/docs/configuration.mdx b/docs/content/docs/configuration.mdx
index 2c53089fc..eaafcd62a 100644
--- a/docs/content/docs/configuration.mdx
+++ b/docs/content/docs/configuration.mdx
@@ -246,7 +246,7 @@ headroom proxy --learn --min-evidence 3
 | `HEADROOM_STRIP_INTERNAL_HEADERS` | Python proxy: whether to strip internal `x-headroom-*` request headers (e.g. `x-headroom-bypass`, `x-headroom-mode`, `x-headroom-user-id`, `x-headroom-stack`, `x-headroom-base-url`) before every upstream forwarder call (PR-A5, fixes P5-49). `enabled` (default) stops fingerprinting / leakage. `disabled` is an explicit operator opt-in for diagnostic shadow tracing — NOT a fallback. Inbound reads of these headers (bypass gating, memory user-id resolution) are unaffected because they read `request.headers` directly. | `enabled` |
 | `HEADROOM_PROXY_STRIP_INTERNAL_HEADERS` | Rust proxy: same policy as `HEADROOM_STRIP_INTERNAL_HEADERS` but for the Rust transparent proxy. Stripping happens inside `build_forward_request_headers` so both HTTP and WebSocket upstream calls are gated by one flag. `enabled` default; `disabled` operator opt-in for diagnostic shadow tracing. Response-side `X-Headroom-*` injection (e.g. `x-headroom-tokens-saved`) is unrelated and stays. | `enabled` |
 | `HEADROOM_EMBEDDER_RUNTIME` | Set to `pytorch_mps` to run the memory embedder via the torch sentence-transformers backend on the Apple GPU (MPS). Only engages when Apple MPS is actually available; otherwise it logs a warning and uses the existing default embedder selection path. `pytorch_mps` is the only accepted value. Requires the `[pytorch-mps]` extra. See [Memory](/docs/memory#embedding-runtime--gpu-offload-apple-silicon). | default embedder selection |
-| `ORT_DYLIB_PATH` | Windows: path to the `onnxruntime.dll` loaded by the Rust core (magika detection, fastembed embeddings). Auto-pinned at `import headroom` to the DLL inside the `onnxruntime` pip package; set it yourself to override. Without a pin the bare Windows DLL search resolves to the Windows ML System32 build (1.17.x on Win11 24H2+), which deadlocks ONNX session init — see [Troubleshooting](/docs/troubleshooting#windows-ml-content-detection-hangs-or-silently-falls-back). | auto-pinned on Windows |
+| `ORT_DYLIB_PATH` | Path to the ONNX Runtime shared library loaded by the Rust core (magika detection, fastembed embeddings), which loads ORT dynamically on every platform. Auto-pinned at `import headroom` to the library inside the `onnxruntime` pip package (`onnxruntime.dll` / `libonnxruntime.so*` / `libonnxruntime*.dylib`); set it yourself to override. Without a pin, ML detection degrades to the non-ONNX tiers; on Windows the bare DLL search can instead resolve to the Windows ML System32 build (1.17.x on Win11 24H2+), which deadlocks ONNX session init — see [Troubleshooting](/docs/troubleshooting#windows-ml-content-detection-hangs-or-silently-falls-back). | auto-pinned when `onnxruntime` is installed |
 | `HEADROOM_MAGIKA_INIT_TIMEOUT_SECS` | Upper bound (integer seconds, > 0) on magika's one-time ONNX session init in the Rust detection chain. On timeout the init error is cached and detection uses the non-ML fallback tiers for the rest of the process; a warning is logged. Safety net for environments where the dylib pin above does not apply. | `5` |
 | `HEADROOM_REQUEST_TIMEOUT` | Request timeout in seconds | `300` |
 | `HEADROOM_BETA_HEADER_STICKY` | Controls per-session `anthropic-beta` / `OpenAI-Beta` re-echo. `enabled` (default): the proxy unions beta tokens across turns within a session — if the client sends a token in turn N and omits it in turn N+1, the proxy re-injects it to preserve prefix-cache stability. `disabled`: the client's value is forwarded verbatim with no accumulation. Any other value raises at request time. See [Session Beta Header Tracking](/docs/configuration#session-beta-header-tracking). | `enabled` |
diff --git a/headroom/_ort.py b/headroom/_ort.py
index 00a55d83c..c7082a591 100644
--- a/headroom/_ort.py
+++ b/headroom/_ort.py
@@ -1,12 +1,16 @@
-"""Pin the ONNX Runtime dylib for the Rust core on Windows.
+"""Pin the ONNX Runtime dylib for the Rust core.
 
 Why this module exists
 ----------------------
-On Windows, ``headroom._core`` consumers of the ``ort`` crate (magika
-content detection, fastembed embeddings) are built with
-``ort-load-dynamic``: the native ``onnxruntime.dll`` is resolved at
-*runtime*. Unless ``ORT_DYLIB_PATH`` is set, ort falls back to a bare
-``LoadLibrary("onnxruntime.dll")`` and the Windows DLL search order
+``headroom._core`` consumers of the ``ort`` crate (magika content
+detection, fastembed embeddings) are built with ``ort-load-dynamic`` on
+every platform: the native ONNX Runtime library is resolved at
+*runtime* rather than statically linked. (Static ``ort-download-binaries``
+linking was dropped on Linux/macOS too because Microsoft's prebuilt
+x86_64 ORT requires AVX2 and executes at extension load, SIGILLing
+``import headroom._core`` on pre-AVX2 CPUs — #1278.) Unless
+``ORT_DYLIB_PATH`` is set, ort falls back to a bare dlopen /
+``LoadLibrary("onnxruntime.dll")``; on Windows the DLL search order
 applies — and ``C:\\Windows\\System32`` wins.
 
 Windows 11 24H2+ ships ``System32\\onnxruntime.dll`` as part of Windows
@@ -19,13 +23,15 @@
 package's DLL (which ``headroom-ai[proxy]`` already depends on).
 
 The fix: before anything can import ``headroom._core``, resolve the
-pip-installed ``onnxruntime\\capi\\onnxruntime.dll`` and export it via
-``ORT_DYLIB_PATH``. ``headroom/__init__.py`` calls this hook, which
-guarantees ordering for every package-level consumer.
+pip-installed ``onnxruntime`` package's shared library
+(``capi/onnxruntime.dll`` / ``capi/libonnxruntime.so*`` /
+``capi/libonnxruntime*.dylib``) and export it via ``ORT_DYLIB_PATH``.
+``headroom/__init__.py`` calls this hook, which guarantees ordering for
+every package-level consumer.
 
 Behavior contract
 -----------------
-- Windows-only; a no-op everywhere else.
+- All platforms; pins only when the ``onnxruntime`` package is present.
 - Respects a pre-set ``ORT_DYLIB_PATH`` (user override wins).
 - Locates the ``onnxruntime`` package via ``find_spec`` WITHOUT
   importing it (importing would load its native code; this hook must
@@ -54,11 +60,11 @@
 
 
 def ensure_ort_dylib_pinned() -> str | None:
-    """Export ``ORT_DYLIB_PATH`` for the Rust core's ort runtime (Windows).
+    """Export ``ORT_DYLIB_PATH`` for the Rust core's ort runtime.
 
     Returns the effective dylib path (pinned now or already present in
-    the environment), or ``None`` when no pin applies (non-Windows, or
-    no ``onnxruntime`` package to point at). Idempotent and exception-free.
+    the environment), or ``None`` when no pin applies (``onnxruntime`` is
+    not installed or lacks its shared library). Idempotent and exception-free.
     """
     global _pinned
     if _pinned is not _UNSET:
@@ -67,10 +73,21 @@ def ensure_ort_dylib_pinned() -> str | None:
     return _pinned  # type: ignore[return-value]
 
 
-def _resolve_and_pin() -> str | None:
-    if not sys.platform.startswith("win"):
-        return None
+def _find_dylib(capi: Path) -> Path | None:
+    """Return the platform's ONNX Runtime shared library inside ``capi``."""
+    if sys.platform.startswith("win"):
+        dll = capi / "onnxruntime.dll"
+        return dll if dll.is_file() else None
+    # Linux ships `libonnxruntime.so.<version>`, macOS `libonnxruntime.dylib`
+    # (sometimes versioned). Glob rather than hardcode the suffix.
+    for pattern in ("libonnxruntime.so*", "libonnxruntime*.dylib"):
+        for candidate in sorted(capi.glob(pattern)):
+            if candidate.is_file():
+                return candidate
+    return None
+
 
+def _resolve_and_pin() -> str | None:
     try:
         existing = os.environ.get(_ENV_VAR)
         if existing:
@@ -81,19 +98,21 @@ def _resolve_and_pin() -> str | None:
         if spec is None or not spec.origin:
             logger.debug(
                 "onnxruntime package not found; %s left unset. The Rust ML detection "
-                "may pick up the Windows ML System32 onnxruntime.dll, which is known "
-                "to deadlock ort init on Windows 11 24H2+ (it then degrades to non-ML "
-                "tiers via HEADROOM_MAGIKA_INIT_TIMEOUT_SECS). Install onnxruntime or "
-                "set %s explicitly.",
+                "cannot load ONNX Runtime and degrades to non-ML tiers (on Windows it "
+                "may instead pick up the Windows ML System32 onnxruntime.dll, which is "
+                "known to deadlock ort init on Windows 11 24H2+ and then degrades via "
+                "HEADROOM_MAGIKA_INIT_TIMEOUT_SECS). Install onnxruntime or set %s "
+                "explicitly.",
                 _ENV_VAR,
                 _ENV_VAR,
             )
             return None
 
-        dll = Path(spec.origin).parent / "capi" / "onnxruntime.dll"
-        if not dll.is_file():
+        dll = _find_dylib(Path(spec.origin).parent / "capi")
+        if dll is None:
             logger.debug(
-                "onnxruntime package found but %s is missing; %s left unset", dll, _ENV_VAR
+                "onnxruntime package found but its shared library is missing; %s left unset",
+                _ENV_VAR,
             )
             return None
 
diff --git a/tests/test_transforms/test_ort_dylib.py b/tests/test_transforms/test_ort_dylib.py
index 8930a07ab..5a79832c6 100644
--- a/tests/test_transforms/test_ort_dylib.py
+++ b/tests/test_transforms/test_ort_dylib.py
@@ -1,9 +1,12 @@
-"""Tests for headroom._ort — the Windows ORT_DYLIB_PATH auto-pin.
-
-The resolver guards the Rust core against the Windows DLL search picking
-up the Windows ML System32 onnxruntime.dll (deadlocks ort session init on
-Win11 24H2+, see headroom/_ort.py). The platform gate is monkeypatched so
-the full logic runs on any CI OS.
+"""Tests for headroom._ort — the ORT_DYLIB_PATH auto-pin.
+
+The resolver points the Rust core's ort-load-dynamic runtime at the pip
+onnxruntime package's shared library on every platform: on Windows it
+guards against the DLL search picking up the Windows ML System32
+onnxruntime.dll (deadlocks ort session init on Win11 24H2+), and on
+Linux/macOS it restores ML detection after static ORT linking was
+dropped for pre-AVX2 CPU compatibility (#1278). The platform is
+monkeypatched so every branch runs on any CI OS.
 """
 
 from __future__ import annotations
@@ -37,10 +40,30 @@ def _fake_spec_for(monkeypatch, package_dir):
     )
 
 
-def test_noop_on_non_windows(monkeypatch):
+def test_pins_versioned_so_on_linux(monkeypatch, tmp_path):
     monkeypatch.setattr(sys, "platform", "linux")
-    assert _ort.ensure_ort_dylib_pinned() is None
-    assert "ORT_DYLIB_PATH" not in _ort.os.environ
+    pkg = tmp_path / "onnxruntime"
+    capi = pkg / "capi"
+    capi.mkdir(parents=True)
+    so = capi / "libonnxruntime.so.1.22.0"
+    so.write_bytes(b"not really a shared object")
+    _fake_spec_for(monkeypatch, pkg)
+
+    assert _ort.ensure_ort_dylib_pinned() == str(so)
+    assert _ort.os.environ["ORT_DYLIB_PATH"] == str(so)
+
+
+def test_pins_dylib_on_macos(monkeypatch, tmp_path):
+    monkeypatch.setattr(sys, "platform", "darwin")
+    pkg = tmp_path / "onnxruntime"
+    capi = pkg / "capi"
+    capi.mkdir(parents=True)
+    dylib = capi / "libonnxruntime.dylib"
+    dylib.write_bytes(b"not really a dylib")
+    _fake_spec_for(monkeypatch, pkg)
+
+    assert _ort.ensure_ort_dylib_pinned() == str(dylib)
+    assert _ort.os.environ["ORT_DYLIB_PATH"] == str(dylib)
 
 
 def test_respects_existing_env(monkeypatch):