diff --git a/Makefile b/Makefile
index 4b2d1a8d5..5b480e53b 100644
--- a/Makefile
+++ b/Makefile
@@ -75,7 +75,7 @@ CONAN_CONFIG ?=
 CONAN_OVERRIDE ?=
 
 BUILD_VERSION ?= main
-PROFILE=default
+PROFILE ?= default
 BUILD_TYPE=Release
 
 # Note that, `benchmarks` and `test coverage` shouldn't  be included in conan's options/configs,
diff --git a/bolt/row/CMakeLists.txt b/bolt/row/CMakeLists.txt
index e840e41ea..a06025be5 100644
--- a/bolt/row/CMakeLists.txt
+++ b/bolt/row/CMakeLists.txt
@@ -25,7 +25,15 @@
 # This modified file is released under the same license.
 # --------------------------------------------------------------------------
 
-bolt_add_library(bolt_row_fast CompactRow.cpp UnsafeRowFast.cpp)
+bolt_add_library(
+  bolt_row_fast
+  CompactRow.cpp
+  UnsafeRowFast.cpp
+  dense/DenseRow.cpp
+  dense/DenseRowGeneralEncode.cpp
+  dense/DenseRowGeneralDecode.cpp
+  dense/DenseRowScalarEncode.cpp
+  dense/DenseRowScalarDecode.cpp)
 
 target_link_libraries(bolt_row_fast PUBLIC bolt_vector)
 
diff --git a/bolt/row/RowFormat.h b/bolt/row/RowFormat.h
new file mode 100644
index 000000000..0029a9a1a
--- /dev/null
+++ b/bolt/row/RowFormat.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace bytedance::bolt::row {
+
+enum class RowFormat : uint8_t {
+  DENSE = 0,
+  COMPACT = 1,
+};
+
+} // namespace bytedance::bolt::row
diff --git a/bolt/row/benchmark/CMakeLists.txt b/bolt/row/benchmark/CMakeLists.txt
index bb6dd5fc1..a9578b4e7 100644
--- a/bolt/row/benchmark/CMakeLists.txt
+++ b/bolt/row/benchmark/CMakeLists.txt
@@ -14,9 +14,20 @@
 # limitations under the License.
 
 add_executable(unsafe_row_serialize_benchmark UnsafeRowSerializeBenchmark.cpp)
+add_executable(dense_row_serialize_benchmark DenseRowSerializeBenchmark.cpp)
 
 target_link_libraries(
   unsafe_row_serialize_benchmark
+  bolt_row_fast
+  bolt_vector_fuzzer
+  bolt_testutils
+  ${FOLLY_BENCHMARK}
+  GTest::gtest
+)
+
+target_link_libraries(
+  dense_row_serialize_benchmark
+  bolt_row_fast
   bolt_vector_fuzzer
   bolt_testutils
   ${FOLLY_BENCHMARK}
diff --git a/bolt/row/benchmark/DenseRowSerializeBenchmark.cpp b/bolt/row/benchmark/DenseRowSerializeBenchmark.cpp
new file mode 100644
index 000000000..9d18a0c4a
--- /dev/null
+++ b/bolt/row/benchmark/DenseRowSerializeBenchmark.cpp
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <folly/Benchmark.h>
+#include <folly/init/Init.h>
+
+#include <array>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <optional>
+#include <random>
+#include <string_view>
+#include <vector>
+
+#include "bolt/row/CompactRow.h"
+#include "bolt/row/UnsafeRowDeserializers.h"
+#include "bolt/row/UnsafeRowFast.h"
+#include "bolt/row/dense/DenseRow.h"
+#include "bolt/row/dense/DenseRowScalar.h"
+#include "bolt/row/dense/IntVarint.h"
+#include "bolt/vector/FlatVector.h"
+#include "bolt/vector/fuzzer/VectorFuzzer.h"
+
+namespace bytedance::bolt::row {
+namespace {
+
+// Serialize a RowVector to one contiguous buffer + (N + 1) cumulative row
+// offsets via DenseRow (the self-allocating shape the benchmark relies on).
+// DenseRow is marker-less, so strip any top-level nulls first.
+struct DenseSerialized {
+  BufferPtr buffer;
+  BufferPtr rowOffsets;
+};
+
+DenseSerialized denseSerialize(
+    const RowVectorPtr& data,
+    memory::MemoryPool* pool) {
+  RowVectorPtr input = data;
+  if (data->mayHaveNulls()) {
+    input = std::make_shared<RowVector>(
+        pool, data->type(), /*nulls=*/nullptr, data->size(), data->children());
+  }
+  DenseRow rows(input);
+  const auto n = rows.numRows();
+  auto offsetsBuf = AlignedBuffer::allocate<size_t>(n + 1, pool);
+  auto* offs = offsetsBuf->asMutable<size_t>();
+  size_t cum = 0;
+  for (vector_size_t r = 0; r < n; ++r) {
+    offs[r] = cum;
+    cum += rows.rowSizes()[r];
+  }
+  offs[n] = cum;
+  auto buf = AlignedBuffer::allocate<char>(std::max<size_t>(cum, 1u), pool);
+  rows.serialize(
+      reinterpret_cast<uint8_t*>(buf->asMutable<char>()),
+      folly::Range<const size_t*>(offs, n));
+  return {std::move(buf), std::move(offsetsBuf)};
+}
+
+enum class SerdeDataKind {
+  kDefault,
+  kBigintScalar,
+  kBigintArray,
+  kBigintNestedArray,
+  kBigintMap,
+  kDoubleRandom,
+  kStringLen8,
+  kStringLen100,
+  kMultiScalar5Small,
+  kMultiScalar10Small,
+};
+
+struct BigintRange {
+  int64_t minInclusive;
+  int64_t maxInclusive;
+};
+
+struct SerdeOnlyBenchmarkCase {
+  RowTypePtr rowType;
+  SerdeDataKind dataKind{SerdeDataKind::kDefault};
+  // For kBigint* data kinds: nullopt means full int64 range (i.e., the
+  // previous "random" case); otherwise the BIGINT values are drawn
+  // uniformly from [min, max].
+  std::optional<BigintRange> bigintRange{std::nullopt};
+  // Fraction of null child values, applied on the kDefault fuzz path. Lets a
+  // case exercise the null-handling (non-fast) encode/decode path.
+  double nullRatio{0.0};
+};
+
+// Fuzzes a RANDOM_ROW with a single BIGINT child uniform in [min, max].
+RowVectorPtr makeRangeBigintData(
+    VectorFuzzer& fuzzer,
+    int64_t minValueInclusive,
+    int64_t maxValueInclusive) {
+  using namespace generator_spec_maker;
+  BOLT_CHECK_LE(minValueInclusive, maxValueInclusive);
+
+  // Bounds are captured by value; a fresh distribution is built per draw.
+  auto drawBigint = [lo = minValueInclusive,
+                     hi = maxValueInclusive](FuzzerGenerator& rng) -> int64_t {
+    return std::uniform_int_distribution<int64_t>(lo, hi)(rng);
+  };
+  auto rowSpec = RANDOM_ROW({RANDOM_BIGINT(drawBigint)});
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(fuzzer.fuzz(*rowSpec));
+  BOLT_CHECK_NOT_NULL(rowVector);
+  return rowVector;
+}
+
+// Returns a generator that produces BIGINT values uniformly distributed in
+// [min, max]. Captured into RANDOM_BIGINT specs below.
+auto makeBigintGen(int64_t minValueInclusive, int64_t maxValueInclusive) {
+  return
+      [minValueInclusive, maxValueInclusive](FuzzerGenerator& rng) -> int64_t {
+        return std::uniform_int_distribution<int64_t>(
+            minValueInclusive, maxValueInclusive)(rng);
+      };
+}
+
+// Generator that produces array/map sizes uniformly in [0, 10]. Matches
+// VectorFuzzer's default container-length distribution.
+auto containerSizeGen() {
+  return [](FuzzerGenerator& rng) -> vector_size_t {
+    return std::uniform_int_distribution<vector_size_t>(0, 10)(rng);
+  };
+}
+
+// Fuzzes ROW({BIGINT, ARRAY(BIGINT)}) data; array lengths come from
+// containerSizeGen().
+RowVectorPtr makeRangeBigintArrayData(
+    VectorFuzzer& fuzzer,
+    int64_t minValueInclusive,
+    int64_t maxValueInclusive) {
+  using namespace generator_spec_maker;
+  BOLT_CHECK_LE(minValueInclusive, maxValueInclusive);
+
+  // The scalar column and the array elements both draw from [min, max].
+  auto bigintGen = makeBigintGen(minValueInclusive, maxValueInclusive);
+  auto rowSpec = RANDOM_ROW({
+      RANDOM_BIGINT(bigintGen),
+      RANDOM_ARRAY(RANDOM_BIGINT(bigintGen), containerSizeGen()),
+  });
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(fuzzer.fuzz(*rowSpec));
+  BOLT_CHECK_NOT_NULL(rowVector);
+  return rowVector;
+}
+
+// Fuzzes ROW({BIGINT, ARRAY(ARRAY(BIGINT))}) data. Inner and outer array
+// lengths both come from containerSizeGen().
+RowVectorPtr makeRangeBigintNestedArrayData(
+    VectorFuzzer& fuzzer,
+    int64_t minValueInclusive,
+    int64_t maxValueInclusive) {
+  using namespace generator_spec_maker;
+  BOLT_CHECK_LE(minValueInclusive, maxValueInclusive);
+
+  // Every BIGINT (top-level column and innermost elements) is in [min, max].
+  auto bigintGen = makeBigintGen(minValueInclusive, maxValueInclusive);
+  auto rowSpec = RANDOM_ROW({
+      RANDOM_BIGINT(bigintGen),
+      RANDOM_ARRAY(
+          RANDOM_ARRAY(RANDOM_BIGINT(bigintGen), containerSizeGen()),
+          containerSizeGen()),
+  });
+
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(fuzzer.fuzz(*rowSpec));
+  BOLT_CHECK_NOT_NULL(rowVector);
+  return rowVector;
+}
+
+// Flat 5-column ROW with the BIGINT constrained to [min, max]. The other four
+// columns use fixed generators: DOUBLE and REAL uniform in [-1, 1), BOOLEAN a
+// fair coin, TINYINT uniform in [-127, 127]. Used to measure the multi-scalar
+// path when the bigint values are small.
+RowVectorPtr makeMultiScalar5SmallData(
+    VectorFuzzer& fuzzer,
+    int64_t minValueInclusive,
+    int64_t maxValueInclusive) {
+  using namespace generator_spec_maker;
+  BOLT_CHECK_LE(minValueInclusive, maxValueInclusive); // min > max is UB.
+  auto rowSpec = RANDOM_ROW({
+      RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)),
+      RANDOM_DOUBLE([](FuzzerGenerator& rng) -> double {
+        return std::uniform_real_distribution<double>(-1.0, 1.0)(rng);
+      }),
+      RANDOM_BOOLEAN([](FuzzerGenerator& rng) -> bool {
+        return std::uniform_int_distribution<int>(0, 1)(rng) != 0;
+      }),
+      RANDOM_TINYINT([](FuzzerGenerator& rng) -> int8_t {
+        return std::uniform_int_distribution<int>(-127, 127)(rng);
+      }),
+      RANDOM_REAL([](FuzzerGenerator& rng) -> float {
+        return std::uniform_real_distribution<float>(-1.0f, 1.0f)(rng);
+      }),
+  });
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(fuzzer.fuzz(*rowSpec));
+  BOLT_CHECK_NOT_NULL(rowVector);
+  return rowVector;
+}
+
+// Flat 10-column ROW where every integer column (BIGINT, INTEGER, SMALLINT,
+// TINYINT) is uniform in [min, max], clamped to that column's value range, so
+// a narrow range keeps its varint encoding short. REAL / DOUBLE (uniform in
+// [-1, 1)) stay fixed-width; BOOLEAN is a fair coin.
+RowVectorPtr makeMultiScalar10SmallData(
+    VectorFuzzer& fuzzer,
+    int64_t minValueInclusive,
+    int64_t maxValueInclusive) {
+  using namespace generator_spec_maker;
+  BOLT_CHECK_LE(minValueInclusive, maxValueInclusive); // min > max is UB.
+  auto narrowReal = [](FuzzerGenerator& rng) -> float {
+    return std::uniform_real_distribution<float>(-1.0f, 1.0f)(rng);
+  };
+  auto narrowDouble = [](FuzzerGenerator& rng) -> double {
+    return std::uniform_real_distribution<double>(-1.0, 1.0)(rng);
+  };
+  auto narrowBool = [](FuzzerGenerator& rng) -> bool {
+    return std::uniform_int_distribution<int>(0, 1)(rng) != 0;
+  };
+  const auto clamp32 = [](int64_t v) -> int32_t {
+    return static_cast<int32_t>(std::clamp(
+        v,
+        static_cast<int64_t>(std::numeric_limits<int32_t>::min()),
+        static_cast<int64_t>(std::numeric_limits<int32_t>::max())));
+  };
+  const auto clamp16 = [](int64_t v) -> int16_t {
+    return static_cast<int16_t>(std::clamp(
+        v,
+        static_cast<int64_t>(std::numeric_limits<int16_t>::min()),
+        static_cast<int64_t>(std::numeric_limits<int16_t>::max())));
+  };
+  const auto clamp8 = [](int64_t v) -> int8_t {
+    return static_cast<int8_t>(std::clamp(
+        v,
+        static_cast<int64_t>(std::numeric_limits<int8_t>::min()),
+        static_cast<int64_t>(std::numeric_limits<int8_t>::max())));
+  };
+  const int32_t intMin = clamp32(minValueInclusive);
+  const int32_t intMax = clamp32(maxValueInclusive);
+  const int16_t smallMin = clamp16(minValueInclusive);
+  const int16_t smallMax = clamp16(maxValueInclusive);
+  const int8_t tinyMin = clamp8(minValueInclusive);
+  const int8_t tinyMax = clamp8(maxValueInclusive);
+  auto rowSpec = RANDOM_ROW({
+      RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)),
+      RANDOM_INTEGER([intMin, intMax](FuzzerGenerator& rng) -> int32_t {
+        return std::uniform_int_distribution<int32_t>(intMin, intMax)(rng);
+      }),
+      RANDOM_SMALLINT([smallMin, smallMax](FuzzerGenerator& rng) -> int16_t {
+        return static_cast<int16_t>(
+            std::uniform_int_distribution<int>(smallMin, smallMax)(rng));
+      }),
+      RANDOM_TINYINT([tinyMin, tinyMax](FuzzerGenerator& rng) -> int8_t {
+        return static_cast<int8_t>(
+            std::uniform_int_distribution<int>(tinyMin, tinyMax)(rng));
+      }),
+      RANDOM_REAL(narrowReal),
+      RANDOM_DOUBLE(narrowDouble),
+      RANDOM_BOOLEAN(narrowBool),
+      RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)),
+      RANDOM_INTEGER([intMin, intMax](FuzzerGenerator& rng) -> int32_t {
+        return std::uniform_int_distribution<int32_t>(intMin, intMax)(rng);
+      }),
+      RANDOM_DOUBLE(narrowDouble),
+  });
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(fuzzer.fuzz(*rowSpec));
+  BOLT_CHECK_NOT_NULL(rowVector);
+  return rowVector;
+}
+
+// ROW({BIGINT, MAP(BIGINT, REAL)}): BIGINTs in [min, max], REALs in [-1, 1).
+RowVectorPtr makeRangeBigintMapData(
+    VectorFuzzer& fuzzer,
+    int64_t minValueInclusive,
+    int64_t maxValueInclusive) {
+  using namespace generator_spec_maker;
+  BOLT_CHECK_LE(minValueInclusive, maxValueInclusive);
+  auto bigintGen = makeBigintGen(minValueInclusive, maxValueInclusive);
+  auto unitReal = [](FuzzerGenerator& rng) -> float {
+    return std::uniform_real_distribution<float>(-1.0f, 1.0f)(rng);
+  };
+  auto rowSpec = RANDOM_ROW({
+      RANDOM_BIGINT(bigintGen),
+      RANDOM_MAP(
+          RANDOM_BIGINT(bigintGen),
+          RANDOM_REAL(unitReal),
+          containerSizeGen()),
+  });
+  auto rowVector = std::dynamic_pointer_cast<RowVector>(fuzzer.fuzz(*rowSpec));
+  BOLT_CHECK_NOT_NULL(rowVector);
+  return rowVector;
+}
+
+// Default for every ranged data kind in makeSerdeOnlyData: when a case leaves
+// `bigintRange` as nullopt, values are drawn from the full int64 range.
+constexpr BigintRange kFullBigintRange{
+    std::numeric_limits<int64_t>::min(),
+    std::numeric_limits<int64_t>::max()};
+
+RowVectorPtr makeSerdeOnlyData(
+    const SerdeOnlyBenchmarkCase& benchmarkCase,
+    memory::MemoryPool* pool) {
+  VectorFuzzer::Options options;
+  options.vectorSize = 1'000;
+  options.nullRatio = benchmarkCase.nullRatio;
+
+  if (benchmarkCase.dataKind == SerdeDataKind::kStringLen8) {
+    options.stringLength = 8;
+    options.stringVariableLength = false;
+  }
+
+  if (benchmarkCase.dataKind == SerdeDataKind::kStringLen100) {
+    options.stringLength = 100;
+    options.stringVariableLength = false;
+  }
+
+  const auto seed = 1;
+  VectorFuzzer fuzzer(options, pool, seed);
+
+  switch (benchmarkCase.dataKind) {
+    case SerdeDataKind::kBigintScalar: {
+      const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange);
+      return makeRangeBigintData(
+          fuzzer, range.minInclusive, range.maxInclusive);
+    }
+    case SerdeDataKind::kBigintArray: {
+      const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange);
+      return makeRangeBigintArrayData(
+          fuzzer, range.minInclusive, range.maxInclusive);
+    }
+    case SerdeDataKind::kBigintNestedArray: {
+      const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange);
+      return makeRangeBigintNestedArrayData(
+          fuzzer, range.minInclusive, range.maxInclusive);
+    }
+    case SerdeDataKind::kBigintMap: {
+      const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange);
+      return makeRangeBigintMapData(
+          fuzzer, range.minInclusive, range.maxInclusive);
+    }
+    case SerdeDataKind::kMultiScalar5Small: {
+      const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange);
+      return makeMultiScalar5SmallData(
+          fuzzer, range.minInclusive, range.maxInclusive);
+    }
+    case SerdeDataKind::kMultiScalar10Small: {
+      const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange);
+      return makeMultiScalar10SmallData(
+          fuzzer, range.minInclusive, range.maxInclusive);
+    }
+    case SerdeDataKind::kDoubleRandom:
+    case SerdeDataKind::kStringLen8:
+    case SerdeDataKind::kStringLen100:
+    case SerdeDataKind::kDefault:
+      // fuzzFlat (not fuzzInputRow): guarantee flat children. fuzzInputRow may
+      // wrap a column in a dictionary, which adds decode/null-merge cost
+      // (DecodedVector::setFlatNulls) unrelated to the row codec.
+      return std::dynamic_pointer_cast<RowVector>(
+          fuzzer.fuzzFlat(benchmarkCase.rowType));
+  }
+
+  BOLT_UNREACHABLE();
+}
+
+// Bytes needed for all rows: fixed row width * rows, else summed row sizes.
+size_t computeUnsafeTotalSize(
+    UnsafeRowFast& unsafeRow,
+    const RowTypePtr& rowType,
+    vector_size_t numRows) {
+  if (const auto fixedWidth = UnsafeRowFast::fixedRowSize(rowType)) {
+    return static_cast<size_t>(*fixedWidth * numRows);
+  }
+  size_t bytes = 0;
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    bytes += unsafeRow.rowSize(row);
+  }
+  return bytes;
+}
+
+// Bytes needed for all rows: fixed row width * rows, else summed row sizes.
+size_t computeCompactTotalSize(
+    CompactRow& compactRow,
+    const RowTypePtr& rowType,
+    vector_size_t numRows) {
+  if (const auto fixedWidth = CompactRow::fixedRowSize(rowType)) {
+    return static_cast<size_t>(*fixedWidth * numRows);
+  }
+  size_t bytes = 0;
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    bytes += compactRow.rowSize(row);
+  }
+  return bytes;
+}
+
+size_t serializeUnsafeToBuffer(
+    UnsafeRowFast& unsafeRow,
+    vector_size_t numRows,
+    char* rawBuffer) {
+  char* cursor = rawBuffer; // Next write position; rows are packed end to end.
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    cursor += unsafeRow.serialize(row, cursor);
+  }
+  return static_cast<size_t>(cursor - rawBuffer); // Total bytes written.
+}
+
+size_t serializeCompactToBuffer(
+    CompactRow& compactRow,
+    vector_size_t numRows,
+    char* rawBuffer) {
+  char* cursor = rawBuffer; // Next write position; rows are packed end to end.
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    cursor += compactRow.serialize(row, cursor);
+  }
+  return static_cast<size_t>(cursor - rawBuffer); // Total bytes written.
+}
+
+// Serializes every row into `buffer` and returns one view per row, in order.
+std::vector<std::optional<std::string_view>> serializeUnsafeRows(
+    UnsafeRowFast& unsafeRow,
+    vector_size_t numRows,
+    BufferPtr& buffer) {
+  std::vector<std::optional<std::string_view>> views;
+  views.reserve(numRows);
+  char* const base = buffer->asMutable<char>();
+  size_t used = 0;
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    char* const dest = base + used;
+    const auto bytes = unsafeRow.serialize(row, dest);
+    views.emplace_back(std::string_view(dest, bytes));
+    used += bytes;
+  }
+  BOLT_CHECK_EQ(buffer->size(), used);
+  return views;
+}
+
+// Serializes every row into `buffer` and returns one view per row, in order.
+std::vector<std::string_view> serializeCompactRows(
+    CompactRow& compactRow,
+    vector_size_t numRows,
+    BufferPtr& buffer) {
+  std::vector<std::string_view> views;
+  views.reserve(numRows);
+  char* const base = buffer->asMutable<char>();
+  size_t used = 0;
+  for (vector_size_t row = 0; row < numRows; ++row) {
+    char* const dest = base + used;
+    const auto bytes = compactRow.serialize(row, dest);
+    views.emplace_back(std::string_view(dest, bytes));
+    used += bytes;
+  }
+  BOLT_CHECK_EQ(buffer->size(), used);
+  return views;
+}
+
+// Times the full Vector -> buffer UnsafeRow path per iteration: build the
+// serializer, size it, allocate, write. Returns nIters * row count.
+int unsafeSer(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  suspender.dismiss();
+  for (int i = 0; i < nIters; ++i) {
+    UnsafeRowFast unsafeRow(data);
+    const auto totalSize =
+        computeUnsafeTotalSize(unsafeRow, benchmarkCase.rowType, data->size());
+    // Zero-filled, matching compactSer, so no stale bytes reach the output.
+    auto buffer = AlignedBuffer::allocate<char>(totalSize, pool.get(), 0);
+    folly::doNotOptimizeAway(serializeUnsafeToBuffer(
+        unsafeRow, data->size(), buffer->asMutable<char>()));
+  }
+  return nIters * data->size();
+}
+
+int compactSer(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  const auto numRows = data->size();
+  const auto fixed = CompactRow::fixedRowSize(benchmarkCase.rowType);
+  suspender.dismiss();
+
+  // Full Vector -> buffer: build CompactRow, compute per-row offsets (its size
+  // pass), allocate (pre-zeroed for null-bit handling), batch serialize.
+  for (int i = 0; i < nIters; ++i) {
+    CompactRow compactRow(data);
+    std::vector<size_t> offsets(numRows);
+    size_t cum = 0;
+    for (vector_size_t r = 0; r < numRows; ++r) {
+      offsets[r] = cum;
+      cum += fixed ? *fixed : static_cast<size_t>(compactRow.rowSize(r));
+    }
+    auto buffer =
+        AlignedBuffer::allocate<char>(std::max<size_t>(cum, 1u), pool.get(), 0);
+    compactRow.serialize(0, numRows, offsets.data(), buffer->asMutable<char>());
+    folly::doNotOptimizeAway(buffer);
+  }
+  return nIters * numRows;
+}
+
+// Times UnsafeRowDeserializer::deserialize only; data setup is suspended.
+int unsafeDeser(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  UnsafeRowFast unsafeRow(data);
+  const auto totalSize =
+      computeUnsafeTotalSize(unsafeRow, benchmarkCase.rowType, data->size());
+  // Zero-filled, matching compactSer, so decoded rows hold no stale bytes.
+  auto buffer = AlignedBuffer::allocate<char>(totalSize, pool.get(), 0);
+  auto serialized = serializeUnsafeRows(unsafeRow, data->size(), buffer);
+  suspender.dismiss();
+  for (int i = 0; i < nIters; ++i) {
+    folly::doNotOptimizeAway(UnsafeRowDeserializer::deserialize(
+        serialized, benchmarkCase.rowType, pool.get()));
+  }
+  return nIters * data->size();
+}
+
+// Times CompactRow::deserialize only; data setup is suspended from timing.
+int compactDeser(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  CompactRow compactRow(data);
+  const auto totalSize =
+      computeCompactTotalSize(compactRow, benchmarkCase.rowType, data->size());
+  // Zero-filled, as in compactSer (pre-zeroed for null-bit handling).
+  auto buffer = AlignedBuffer::allocate<char>(totalSize, pool.get(), 0);
+  auto serialized = serializeCompactRows(compactRow, data->size(), buffer);
+  suspender.dismiss();
+  for (int i = 0; i < nIters; ++i) {
+    folly::doNotOptimizeAway(
+        CompactRow::deserialize(serialized, benchmarkCase.rowType, pool.get()));
+  }
+  return nIters * data->size();
+}
+
+// Register a serde benchmark (func) for one case, shown as func(label). This
+// calls addBenchmark directly (not BENCHMARK_NAMED_PARAM_MULTI); the lambda
+// only forwards nIters and the case to func and returns func's result.
+// NOTE(review): no per-benchmark progress message is printed here, so results
+// still appear only in folly's table at the end of the run.
+#define SERDE_BENCH(func, label, benchmarkCase)                      \
+  FOLLY_MAYBE_UNUSED static bool FB_ANONYMOUS_VARIABLE(serdeBench) = \
+      (::folly::addBenchmark(                                        \
+           __FILE__,                                                 \
+           #func "(" #label ")",                                     \
+           [](unsigned nIters) -> unsigned {                         \
+             return func(nIters, benchmarkCase);                     \
+           }),                                                       \
+       true)
+
+int denseSer(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  suspender.dismiss();
+
+  // Full Vector -> buffer: denseSerialize builds the DenseRow (which runs the
+  // size pass — addColumnSizes), computes offsets, allocates, and serializes
+  // (it also strips top-level nulls, since DenseRow is marker-less).
+  for (int i = 0; i < nIters; ++i) {
+    folly::doNotOptimizeAway(denseSerialize(data, pool.get()).buffer);
+  }
+  return nIters * data->size();
+}
+
+int denseDeser(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  auto serialized = denseSerialize(data, pool.get());
+  const auto* bytes = serialized.buffer->as<char>();
+  const auto* offsets = serialized.rowOffsets->as<size_t>();
+  const auto rowCount = data->size();
+  std::vector<std::string_view> rows;
+  rows.reserve(rowCount);
+  for (vector_size_t i = 0; i < rowCount; ++i) {
+    rows.emplace_back(bytes + offsets[i], offsets[i + 1] - offsets[i]);
+  }
+  suspender.dismiss();
+
+  for (int i = 0; i < nIters; ++i) {
+    folly::doNotOptimizeAway(
+        DenseRow::deserialize(rows, benchmarkCase.rowType, pool.get()));
+  }
+  return nIters * data->size();
+}
+
+// The dense size pass in isolation: DenseRow construction = decode +
+// addColumnSizes (no buffer write). Lets you see how much of denseSer is the
+// size pass vs the byte writing.
+int denseSizePass(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) {
+  auto pool = memory::memoryManager()->addLeafPool();
+
+  folly::BenchmarkSuspender suspender;
+  auto data = makeSerdeOnlyData(benchmarkCase, pool.get());
+  RowVectorPtr input = data;
+  if (data->mayHaveNulls()) {
+    input = std::make_shared<RowVector>(
+        pool.get(), data->type(), nullptr, data->size(), data->children());
+  }
+  suspender.dismiss();
+
+  for (int i = 0; i < nIters; ++i) {
+    DenseRow rows(input);
+    folly::doNotOptimizeAway(rows.rowSizes()[0]);
+  }
+  return nIters * data->size();
+}
+
+// Register every format for one case as a single adjacent block, so the
+// results table groups them for comparison: the three serializers, then the
+// dense size pass, then the three deserializers. Cases are invoked in a
+// comparable order below (scalars, then each container type with its
+// value-range variants).
+#define CASE_BENCHMARKS(name, benchmarkCase)       \
+  SERDE_BENCH(unsafeSer, name, benchmarkCase);     \
+  SERDE_BENCH(compactSer, name, benchmarkCase);    \
+  SERDE_BENCH(denseSer, name, benchmarkCase);      \
+  SERDE_BENCH(denseSizePass, name, benchmarkCase); \
+  SERDE_BENCH(unsafeDeser, name, benchmarkCase);   \
+  SERDE_BENCH(compactDeser, name, benchmarkCase);  \
+  SERDE_BENCH(denseDeser, name, benchmarkCase)
+
+constexpr BigintRange kRangeLt2Pow8{-((1LL << 8) - 1), (1LL << 8) - 1};
+constexpr BigintRange kRangeLt2Pow32{-((1LL << 32) - 1), (1LL << 32) - 1};
+
+const SerdeOnlyBenchmarkCase kBigintLt2Pow8{
+    ROW({BIGINT()}),
+    SerdeDataKind::kBigintScalar,
+    kRangeLt2Pow8};
+const SerdeOnlyBenchmarkCase kBigintLt2Pow32{
+    ROW({BIGINT()}),
+    SerdeDataKind::kBigintScalar,
+    kRangeLt2Pow32};
+const SerdeOnlyBenchmarkCase kBigintRandom{
+    ROW({BIGINT()}),
+    SerdeDataKind::kBigintScalar};
+// Full-range BIGINT with ~40% null children: exercises the null-handling
+// (non-fast) encode/decode path that the SIMD/contiguous fast paths skip.
+const SerdeOnlyBenchmarkCase kBigintRandomNullable{
+    ROW({BIGINT()}),
+    SerdeDataKind::kDefault,
+    std::nullopt,
+    0.4};
+const SerdeOnlyBenchmarkCase kDoubleRandom{
+    ROW({DOUBLE()}),
+    SerdeDataKind::kDoubleRandom};
+const SerdeOnlyBenchmarkCase kStringLen8{
+    ROW({VARCHAR()}),
+    SerdeDataKind::kStringLen8};
+const SerdeOnlyBenchmarkCase kStringLen100{
+    ROW({VARCHAR()}),
+    SerdeDataKind::kStringLen100};
+const SerdeOnlyBenchmarkCase kArrays{
+    ROW({BIGINT(), ARRAY(BIGINT())}),
+    SerdeDataKind::kDefault};
+const SerdeOnlyBenchmarkCase kNestedArrays{
+    ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))}),
+    SerdeDataKind::kDefault};
+const SerdeOnlyBenchmarkCase kMaps{
+    ROW({BIGINT(), MAP(BIGINT(), REAL())}),
+    SerdeDataKind::kDefault};
+const SerdeOnlyBenchmarkCase kArraysBigintLt2Pow8{
+    ROW({BIGINT(), ARRAY(BIGINT())}),
+    SerdeDataKind::kBigintArray,
+    kRangeLt2Pow8};
+const SerdeOnlyBenchmarkCase kArraysBigintLt2Pow32{
+    ROW({BIGINT(), ARRAY(BIGINT())}),
+    SerdeDataKind::kBigintArray,
+    kRangeLt2Pow32};
+const SerdeOnlyBenchmarkCase kNestedArraysBigintLt2Pow8{
+    ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))}),
+    SerdeDataKind::kBigintNestedArray,
+    kRangeLt2Pow8};
+const SerdeOnlyBenchmarkCase kNestedArraysBigintLt2Pow32{
+    ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))}),
+    SerdeDataKind::kBigintNestedArray,
+    kRangeLt2Pow32};
+const SerdeOnlyBenchmarkCase kMapsBigintLt2Pow8{
+    ROW({BIGINT(), MAP(BIGINT(), REAL())}),
+    SerdeDataKind::kBigintMap,
+    kRangeLt2Pow8};
+const SerdeOnlyBenchmarkCase kMapsBigintLt2Pow32{
+    ROW({BIGINT(), MAP(BIGINT(), REAL())}),
+    SerdeDataKind::kBigintMap,
+    kRangeLt2Pow32};
+
+// Flat row of multiple simple-type columns. Exercises the top-level ROW
+// driver against scalar leaf encoders (no nested ARRAY/MAP), which is the
+// path most directly comparable to CompactRow's strength.
+const SerdeOnlyBenchmarkCase kMultiScalar5{
+    ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()}),
+    SerdeDataKind::kDefault};
+const SerdeOnlyBenchmarkCase kMultiScalar10{
+    ROW(
+        {BIGINT(),
+         INTEGER(),
+         SMALLINT(),
+         TINYINT(),
+         REAL(),
+         DOUBLE(),
+         BOOLEAN(),
+         BIGINT(),
+         INTEGER(),
+         DOUBLE()}),
+    SerdeDataKind::kDefault};
+
+// Small-value variants: BIGINT (and INTEGER/SMALLINT/TINYINT for the 10-col
+// case) restricted to [-(2^8-1), 2^8-1] so every integer encodes in a
+// single varint byte. Highlights dense's strength on narrow scalar data
+// where its on-wire size drops well below CompactRow's fixed widths.
+const SerdeOnlyBenchmarkCase kMultiScalar5SmallLt2Pow8{
+    ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()}),
+    SerdeDataKind::kMultiScalar5Small,
+    kRangeLt2Pow8};
+const SerdeOnlyBenchmarkCase kMultiScalar10SmallLt2Pow8{
+    ROW(
+        {BIGINT(),
+         INTEGER(),
+         SMALLINT(),
+         TINYINT(),
+         REAL(),
+         DOUBLE(),
+         BOOLEAN(),
+         BIGINT(),
+         INTEGER(),
+         DOUBLE()}),
+    SerdeDataKind::kMultiScalar10Small,
+    kRangeLt2Pow8};
+
+// --- Scalars: BIGINT swept by value range, then double / strings. ---
+CASE_BENCHMARKS(bigint_lt_2pow8, kBigintLt2Pow8);
+CASE_BENCHMARKS(bigint_lt_2pow32, kBigintLt2Pow32);
+CASE_BENCHMARKS(bigint_random, kBigintRandom);
+CASE_BENCHMARKS(bigint_random_nullable, kBigintRandomNullable);
+CASE_BENCHMARKS(double_random, kDoubleRandom);
+CASE_BENCHMARKS(string_len8, kStringLen8);
+CASE_BENCHMARKS(string_len100, kStringLen100);
+
+// --- Multi-column flat rows: 5- and 10-column, full vs small-int. ---
+CASE_BENCHMARKS(multiScalar5, kMultiScalar5);
+CASE_BENCHMARKS(multiScalar5_small_lt_2pow8, kMultiScalar5SmallLt2Pow8);
+CASE_BENCHMARKS(multiScalar10, kMultiScalar10);
+CASE_BENCHMARKS(multiScalar10_small_lt_2pow8, kMultiScalar10SmallLt2Pow8);
+
+// --- Containers: each type next to its value-range variants. ---
+CASE_BENCHMARKS(arrays, kArrays);
+CASE_BENCHMARKS(arrays_bigint_lt_2pow8, kArraysBigintLt2Pow8);
+CASE_BENCHMARKS(arrays_bigint_lt_2pow32, kArraysBigintLt2Pow32);
+CASE_BENCHMARKS(nestedArrays, kNestedArrays);
+CASE_BENCHMARKS(nestedArrays_bigint_lt_2pow8, kNestedArraysBigintLt2Pow8);
+CASE_BENCHMARKS(nestedArrays_bigint_lt_2pow32, kNestedArraysBigintLt2Pow32);
+CASE_BENCHMARKS(maps, kMaps);
+CASE_BENCHMARKS(maps_bigint_lt_2pow8, kMapsBigintLt2Pow8);
+CASE_BENCHMARKS(maps_bigint_lt_2pow32, kMapsBigintLt2Pow32);
+
+} // namespace
+
+// ===========================================================================
+// Size-pass microbenchmark: times the REAL scalar::addColumnSizes on a flat
+// nullable BIGINT column across value magnitudes. Calling the compiled function
+// (not a local copy) keeps it from auto-vectorizing in this TU, so this matches
+// the production embedded behavior. A/B the internal kernel by toggling the
+// SIMD wiring in scalar::addColumnSizes and rebuilding.
+// ===========================================================================
+namespace size_bench {
+constexpr vector_size_t kN = 4096;
+
+struct SizeInput { // one prepared input for the addColumnSizes benchmarks
+  VectorPtr vec; // nullable BIGINT column: flat, or a dictionary over the flat
+  DecodedVector decoded; // decode of `vec`; this is what addColumnSizes reads
+  std::vector<size_t> rowSizes; // kN accumulators, zeroed by makeInput
+};
+
+// `mag` picks the range: 0 = [-100,100], 1 = int32, else int64; ~10% nulls.
+std::unique_ptr<SizeInput> makeInput(memory::MemoryPool* pool, int mag) {
+  auto input = std::make_unique<SizeInput>();
+  input->vec = BaseVector::create(BIGINT(), kN, pool);
+  auto* column = input->vec->asUnchecked<FlatVector<int64_t>>();
+  auto* values = column->mutableRawValues();
+  std::mt19937_64 gen(0x9E3779B97F4A7C15ull ^ static_cast<uint64_t>(mag));
+  for (vector_size_t row = 0; row < kN; ++row) {
+    const uint64_t bits = gen(); // one draw for the value, one for the null
+    if (mag == 0) {
+      values[row] = static_cast<int64_t>(bits % 201) - 100;
+    } else {
+      values[row] = mag == 1 ? static_cast<int64_t>(static_cast<int32_t>(bits))
+                             : static_cast<int64_t>(bits);
+    }
+    if (gen() % 10 == 0) {
+      column->setNull(row, true);
+    }
+  }
+  input->decoded.decode(*input->vec);
+  input->rowSizes.assign(kN, 0);
+  return input;
+}
+
+// makeInput's data behind a reversed-index dictionary (non-identity), meant to
+// push addColumnSizes onto its scalar per-value loop rather than the SIMD one.
+std::unique_ptr<SizeInput> makeDictInput(memory::MemoryPool* pool, int mag) {
+  auto input = makeInput(pool, mag); // same values and nulls as the flat case
+  auto base = input->vec; // keep the flat vector so it can be wrapped below
+  auto mapping = allocateIndices(kN, pool);
+  auto* slots = mapping->asMutable<vector_size_t>();
+  for (vector_size_t src = 0; src < kN; ++src) {
+    slots[kN - 1 - src] = src; // output row kN - 1 - src reads flat row src
+  }
+  input->vec = BaseVector::wrapInDictionary(nullptr, mapping, kN, base);
+  input->decoded.decode(*input->vec); // re-decode: vec is now the dictionary
+  return input;
+}
+
+} // namespace size_bench
+
+#define SIZE_BENCH(tag, mag) /* flat + dict size benchmarks */        \
+  BENCHMARK(addColumnSizes_##tag) {                                   \
+    static auto pool = memory::memoryManager()->addLeafPool();        \
+    static auto in = size_bench::makeInput(pool.get(), mag);          \
+    dense_row::scalar::addColumnSizes(                                \
+        *BIGINT(), in->decoded, size_bench::kN, in->rowSizes.data()); \
+    folly::doNotOptimizeAway(in->rowSizes[0]);                        \
+  }                                                                   \
+  BENCHMARK(addColumnSizes_dict_##tag) {                              \
+    static auto pool = memory::memoryManager()->addLeafPool();        \
+    static auto in = size_bench::makeDictInput(pool.get(), mag);      \
+    dense_row::scalar::addColumnSizes(                                \
+        *BIGINT(), in->decoded, size_bench::kN, in->rowSizes.data()); \
+    folly::doNotOptimizeAway(in->rowSizes[0]);                        \
+  }
+
+SIZE_BENCH(small, 0) // mag 0: values in [-100, 100]
+SIZE_BENCH(medium_i32, 1) // mag 1: full int32 range
+SIZE_BENCH(large_i64, 2) // mag 2: full int64 range
+
+// Emits one table, after folly's timing output, listing each case's serialized
+// size in bytes per row under UnsafeRow, CompactRow and DenseRow. The benchmark
+// table itself shows only timings.
+void printSerializedSizes() {
+  struct SizeEntry {
+    const char* label;
+    const SerdeOnlyBenchmarkCase* spec;
+  };
+  static const SizeEntry kTable[] = {
+      {"bigint_lt_2pow8", &kBigintLt2Pow8},
+      {"bigint_lt_2pow32", &kBigintLt2Pow32},
+      {"bigint_random", &kBigintRandom},
+      {"bigint_random_nullable", &kBigintRandomNullable},
+      {"double_random", &kDoubleRandom},
+      {"string_len8", &kStringLen8},
+      {"string_len100", &kStringLen100},
+      {"multiScalar5", &kMultiScalar5},
+      {"multiScalar5_small_lt_2pow8", &kMultiScalar5SmallLt2Pow8},
+      {"multiScalar10", &kMultiScalar10},
+      {"multiScalar10_small_lt_2pow8", &kMultiScalar10SmallLt2Pow8},
+      {"arrays", &kArrays},
+      {"arrays_bigint_lt_2pow8", &kArraysBigintLt2Pow8},
+      {"arrays_bigint_lt_2pow32", &kArraysBigintLt2Pow32},
+      {"nestedArrays", &kNestedArrays},
+      {"nestedArrays_bigint_lt_2pow8", &kNestedArraysBigintLt2Pow8},
+      {"nestedArrays_bigint_lt_2pow32", &kNestedArraysBigintLt2Pow32},
+      {"maps", &kMaps},
+      {"maps_bigint_lt_2pow8", &kMapsBigintLt2Pow8},
+      {"maps_bigint_lt_2pow32", &kMapsBigintLt2Pow32},
+  };
+
+  std::printf("\n=== serialized size (bytes/row) ===\n");
+  std::printf("%-30s %8s %8s %8s\n", "case", "unsafe", "compact", "dense");
+  auto pool = memory::memoryManager()->addLeafPool();
+  for (const auto& entry : kTable) {
+    const auto& type = entry.spec->rowType;
+    auto batch = makeSerdeOnlyData(*entry.spec, pool.get());
+    const auto rows = batch->size();
+    UnsafeRowFast unsafeRow(batch);
+    CompactRow compactRow(batch);
+    // Each column is that format's total size divided by the row count.
+    const auto unsafe = computeUnsafeTotalSize(unsafeRow, type, rows) / rows;
+    const auto compact =
+        computeCompactTotalSize(compactRow, type, rows) / rows;
+    const auto dense = denseSerialize(batch, pool.get()).buffer->size() / rows;
+    std::printf("%-30s %8zu %8zu %8zu\n", entry.label, unsafe, compact, dense);
+  }
+  std::fflush(stdout);
+}
+
+} // namespace bytedance::bolt::row
+
+int main(int argc, char** argv) { // benchmark driver entry point
+  folly::init(&argc, &argv);
+  bytedance::bolt::memory::MemoryManager::initialize({});
+  folly::runBenchmarks(); // timings first
+  bytedance::bolt::row::printSerializedSizes(); // then the bytes/row table
+  return 0;
+}
diff --git a/bolt/row/dense/DenseRow.cpp b/bolt/row/dense/DenseRow.cpp
new file mode 100644
index 000000000..53e898a31
--- /dev/null
+++ b/bolt/row/dense/DenseRow.cpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bolt/row/dense/DenseRow.h"
+
+// =============================================================================
+// Wire format (in-tree spec — frozen, byte-identical across this codebase)
+// =============================================================================
+//
+// Encoding is purely TYPE-DRIVEN: every value is encoded according to its
+// vector's concrete type. A row's blob is the concatenation of its fields,
+// each encoded by its own type. There is no top-level row marker — the caller
+// frames rows and guarantees no top-level null rows.
+//
+// Core property: LEVEL-HOISTED. At every nesting level the structural bytes for
+// all of that level's positions (nested ROW markers / ARRAY|MAP cardinalities /
+// VARCHAR lengths) are written first, then the level descends into children.
+// Each row's whole blob (all levels) lives in that row's own byte range.
+//
+//   row_blob := encode(field_0) ... encode(field_{k-1})
+//
+//   encode(T):
+//     TINYINT/SMALLINT/INTEGER/BIGINT/TIMESTAMP:
+//         null -> 0x00 | INT64_MIN -> 0x80 0x00
+//         else  -> varint(zigzag(adjust(v))), adjust(v) = v > 0 ? v : v - 1
+//     BOOLEAN:           varint(0 = null | 1 = false | 2 = true)
+//     REAL:              4B LE float bits; sentinel 0x7fc00000 = null
+//                        (a non-null value colliding with the sentinel is
+//                        bit-flipped on encode and restored on decode)
+//     DOUBLE:            8B LE; sentinel 0x7ff8000000000000 = null (as above)
+//     VARCHAR/VARBINARY: varint(len + 1) (0 = null), then len payload bytes.
+//                        Under a multi-position level, ALL lengths are written
+//                        before ALL payloads.
+//     HUGEINT:           nullable-int64 of zigzag128(value)'s low 64 bits (its
+//                        0x00 sentinel = null, no separate tag); when non-null,
+//                        followed by varint(high 64 bits of zigzag128(value))
+//     UNKNOWN:           varint(0) (always null)
+//     ROW:               per position varint(0 = null | 1 = present), then
+//                        recurse each field (null positions emit only the
+//                        marker and are filtered from children via parentNulls)
+//     ARRAY:             per position varint(0 = null | cardinality + 1), then
+//                        recurse the element column over the child positions
+//     MAP:               per position varint(0 = null | cardinality + 1), then
+//                        recurse the keys column, then the values column
+//
+// Frozen invariants: the INT64_MIN sentinel, cardinality + 1, MAP's keys-then-
+// values segment ordering, and the level-hoisted ordering above. Empty
+// array/map (cardinality+1 = 1) is distinct from null (0).
+//
+// Routing is per top-level field by type: scalar fields take the scalar
+// column-at-a-time path, complex (ARRAY/MAP/ROW) fields take the general
+// column-batch path; a row whose fields are all scalar uses a dedicated
+// fast path that skips the general scaffolding entirely. The codec kernels
+// live in sibling TUs, declared in DenseRowGeneral.h / DenseRowScalar.h:
+//   * DenseRowGeneralEncode.cpp   general column-batch encode
+//   * DenseRowGeneralDecode.cpp   general column-batch decode
+//   * DenseRowScalar.{h,*.cpp}    scalar column fast path
+// This file is the public API layer (the DenseRow class) only.
+// =============================================================================
+
+#include <limits>
+#include <optional>
+#include <variant>
+#include <vector>
+
+#include "bolt/row/dense/DenseRowGeneral.h"
+#include "bolt/row/dense/DenseRowScalar.h"
+#include "bolt/vector/ComplexVector.h"
+#include "vector/DecodedVector.h"
+
+namespace bytedance::bolt::row {
+
+using namespace dense_row;
+
+struct DenseRow::State {
+  // Keeps the input column data alive for this DenseRow's lifetime.
+  RowVectorPtr rowVector;
+  vector_size_t numRows{0}; // rowVector->size()
+  std::vector<size_t> rowSizes; // encoded bytes per row, set by the constructor
+  size_t totalSize{0}; // sum of rowSizes
+
+  // Routing is per top-level field by type: a scalar field is decoded and
+  // sized/written column-at-a-time (DecodedVector); a complex (ARRAY/MAP/ROW)
+  // field goes through the general column-batch path (ColumnPlan). Sized to
+  // fieldCount (left empty when numRows == 0); entry k belongs to field k.
+  std::vector<std::variant<DecodedVector, ColumnPlan>> decodedOrPlans;
+
+  // Top-level slot view ({r, 1} per row) for the complex columns. Only built
+  // when the row has a complex field (an all-scalar row leaves it empty). The
+  // nested slot trees live in each ARRAY/MAP node's ColumnPlan::childSlots,
+  // built by the size pass and replayed by the write pass.
+  TopSlotView topView;
+};
+
+DenseRow::DenseRow(const RowVectorPtr& rowVector)
+    : state_(std::make_unique<State>()) {
+  auto& st = *state_;
+  st.rowVector = rowVector; // shared ownership keeps the input columns alive
+  const auto numRows = rowVector->size();
+  st.numRows = numRows;
+  st.rowSizes.assign(numRows, 0); // per-row byte totals, accumulated below
+  const auto& rowType = rowVector->type()->asRow();
+  const auto fieldCount = rowType.size();
+
+  if (numRows > 0) {
+    // Size pass. Route each top-level field by type: a scalar field is sized
+    // column-at-a-time straight into rowSizes; a complex (ARRAY/MAP/ROW) field
+    // runs the general SizeSink pass (which also builds the slot tree reused by
+    // the write pass) and accumulates into sizeSinks. The general scaffolding
+    // (slot view + sink array + plan slots) is allocated only when a complex
+    // field is present, so an all-scalar row pays nothing extra. Complex fields
+    // are visited in field order so the slot tree replays in that order during
+    // serialize().
+    bool anyComplex = false;
+    for (size_t k = 0; k < fieldCount; ++k) {
+      if (!rowType.childAt(k)->isPrimitiveType()) {
+        anyComplex = true;
+        break;
+      }
+    }
+
+    st.decodedOrPlans.resize(fieldCount);
+    std::vector<SizeSink> sizeSinks; // stays empty for an all-scalar row
+    if (anyComplex) {
+      st.topView = makeTopView(numRows);
+      sizeSinks.resize(numRows);
+    }
+
+    for (size_t k = 0; k < fieldCount; ++k) {
+      const auto& childType = rowType.childAt(k);
+      if (childType->isPrimitiveType()) {
+        st.decodedOrPlans[k].emplace<DecodedVector>();
+        auto* decoded = std::get_if<DecodedVector>(&st.decodedOrPlans[k]);
+
+        decoded->decode(*rowVector->childAt(k)); // cached for serialize()
+        scalar::addColumnSizes(
+            *childType, *decoded, numRows, st.rowSizes.data());
+      } else {
+        st.decodedOrPlans[k].emplace<ColumnPlan>(
+            buildPlan(childType, rowVector->childAt(k)));
+        auto* plan = std::get_if<ColumnPlan>(&st.decodedOrPlans[k]);
+        encodeColumnBatch<SizeSink>(
+            *childType,
+            *plan,
+            st.topView.view(),
+            folly::Range<SizeSink*>(sizeSinks.data(), numRows),
+            /*rowNulls=*/nullptr);
+      }
+    }
+    if (anyComplex) {
+      for (vector_size_t r = 0; r < numRows; ++r) {
+        st.rowSizes[r] += sizeSinks[r].bytes; // fold complex-field bytes in
+      }
+    }
+  }
+
+  size_t total = 0; // stays 0 for an empty input
+  for (size_t s : st.rowSizes) {
+    total += s;
+  }
+  st.totalSize = total;
+}
+
+DenseRow::DenseRow(DenseRow&&) noexcept = default; // leaves source state_ null
+DenseRow& DenseRow::operator=(DenseRow&&) noexcept = default;
+DenseRow::~DenseRow() = default; // defined here, where State is complete
+
+vector_size_t DenseRow::numRows() const {
+  return state_->numRows; // row count captured by the constructor
+}
+
+const std::vector<size_t>& DenseRow::rowSizes() const {
+  return state_->rowSizes; // reference into this object's state
+}
+
+size_t DenseRow::rowSizeAt(vector_size_t index) const {
+  return state_->rowSizes[index]; // unchecked: index must be in [0, numRows())
+}
+
+size_t DenseRow::totalSize() const {
+  return state_->totalSize; // precomputed sum of rowSizes
+}
+
+void DenseRow::serialize(uint8_t* base, folly::Range<const size_t*> offsets)
+    const {
+  const auto numRows = state_->numRows;
+  BOLT_USER_CHECK_EQ(
+      offsets.size(),
+      static_cast<size_t>(numRows),
+      "DenseRow::serialize offsets size mismatch");
+  if (numRows == 0) {
+    return; // empty input: the constructor built no per-field state
+  }
+  const auto& rowType = state_->rowVector->type()->asRow();
+  const auto fieldCount = rowType.size();
+
+  // Write fields in declaration order, sharing one per-row write cursor so each
+  // field lands at the right offset. Scalar fields advance the cursor directly;
+  // complex fields run the general WriteSink pass (replaying the cached slot
+  // tree), syncing the cursor across the call. writeSinks is allocated only if
+  // a complex field is present (an all-scalar row never touches it).
+  std::vector<uint8_t*> cursors(numRows);
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    cursors[r] = base + offsets[r]; // row r starts at its caller-given offset
+  }
+  std::vector<WriteSink> writeSinks;
+  for (size_t k = 0; k < fieldCount; ++k) {
+    const auto& childType = rowType.childAt(k);
+    std::visit(
+        [&](auto& decodedOrPlan) {
+          using T = std::decay_t<decltype(decodedOrPlan)>;
+          if constexpr (std::is_same_v<T, DecodedVector>) {
+            BOLT_CHECK(childType->isPrimitiveType());
+            scalar::writeColumn(
+                *childType, decodedOrPlan, numRows, cursors.data());
+          } else {
+            static_assert(std::is_same_v<T, ColumnPlan>);
+            if (writeSinks.empty()) {
+              writeSinks.resize(numRows); // first complex field: allocate once
+            }
+            for (vector_size_t r = 0; r < numRows; ++r) {
+              writeSinks[r].out = cursors[r]; // hand the cursor to the sink
+            }
+            encodeColumnBatch<WriteSink>(
+                *childType,
+                decodedOrPlan,
+                state_->topView.view(),
+                folly::Range<WriteSink*>(writeSinks.data(), numRows),
+                /*rowNulls=*/nullptr);
+            for (vector_size_t r = 0; r < numRows; ++r) {
+              cursors[r] = writeSinks[r].out; // take the advanced cursor back
+            }
+          }
+        },
+        state_->decodedOrPlans[k]);
+  }
+
+  for (vector_size_t r = 0; r < numRows; ++r) { // post-write size audit
+    const auto* rowStart = base + offsets[r];
+    const auto actualSize = static_cast<size_t>(cursors[r] - rowStart);
+    const auto expectedSize = state_->rowSizes[r];
+    BOLT_CHECK_EQ(
+        actualSize,
+        expectedSize,
+        "DenseRow::serialize row size mismatch at row {}, offset {}",
+        r,
+        offsets[r]);
+  }
+}
+
+RowVectorPtr DenseRow::deserialize(
+    const std::vector<std::string_view>& data,
+    const RowTypePtr& rowType,
+    memory::MemoryPool* pool) {
+  const auto rowCount = static_cast<vector_size_t>(data.size());
+
+  // Decode fields in declaration order, sharing one per-row read cursor
+  // (marker-less, so no top-level nulls). Scalar fields read column-at-a-time;
+  // complex fields run the general decode over the top slot view, which is
+  // built only when a complex field is present. Mirrors serialize().
+  // NOTE(review): data.size() is narrowed to vector_size_t above, unchecked.
+  auto out = BaseVector::create(rowType, rowCount, pool);
+  auto* rowVec = out->asUnchecked<RowVector>();
+  std::vector<RowCursor> cursors(rowCount);
+  for (vector_size_t r = 0; r < rowCount; ++r) {
+    cursors[r].cur = reinterpret_cast<const uint8_t*>(data[r].data());
+    cursors[r].end = cursors[r].cur + data[r].size();
+    rowVec->setNull(r, false); // every top-level row is marked non-null
+  }
+
+  const auto cursorRange = folly::Range<RowCursor*>(cursors.data(), rowCount);
+  const auto fieldCount = rowType->size();
+  bool anyComplex = false;
+  for (size_t k = 0; k < fieldCount; ++k) {
+    if (!rowType->childAt(k)->isPrimitiveType()) {
+      anyComplex = true;
+      break;
+    }
+  }
+  TopSlotView top; // left empty unless a complex field needs it
+  if (anyComplex) {
+    top = makeTopView(rowCount);
+  }
+  for (size_t k = 0; k < fieldCount; ++k) {
+    const auto& childType = rowType->childAt(k);
+    if (childType->isPrimitiveType()) {
+      scalar::readColumn(
+          *childType, *rowVec->childAt(k), rowCount, cursorRange);
+    } else {
+      decodeColumnBatch(
+          *childType,
+          *rowVec->childAt(k),
+          top.view(),
+          cursorRange,
+          /*rowNulls=*/nullptr);
+    }
+  }
+
+  for (vector_size_t r = 0; r < rowCount; ++r) { // reject trailing bytes
+    BOLT_USER_CHECK(
+        cursors[r].cur == cursors[r].end,
+        "DenseRow: row {} not fully consumed ({} bytes remaining)",
+        r,
+        cursors[r].end - cursors[r].cur);
+  }
+  return std::dynamic_pointer_cast<RowVector>(out);
+}
+
+} // namespace bytedance::bolt::row
diff --git a/bolt/row/dense/DenseRow.h b/bolt/row/dense/DenseRow.h
new file mode 100644
index 000000000..b169db3b8
--- /dev/null
+++ b/bolt/row/dense/DenseRow.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include <folly/Range.h>
+
+#include "bolt/vector/BaseVector.h"
+#include "bolt/vector/ComplexVector.h"
+
+// Dense row serializer — sibling to CompactRow / UnsafeRowFast, but
+// column-batched (processes all rows at once) rather than row-at-a-time: sizes
+// are computed for the whole batch up front and there is no serialize(i).
+//
+// The wire format is the "dense", no-waste layout:
+// 1. variable-length (varint) integers and lengths,
+// 2. nulls fused into the structure bytes (no null bitmap),
+// 3. no alignment padding,
+// 4. level-hoisted nesting.
+// The grammar is documented at the top of DenseRow.cpp.
+//
+// Usage (mirrors CompactRow):
+//   DenseRow rows(rowVector);                 // builds plan + sizes (once)
+//   auto offsets = to_offsets(rows.rowSizes());
+//   rows.serialize(base, offsets);            // write all rows at the offsets
+//
+//   auto rv = DenseRow::deserialize(ranges, rowType, pool);
+namespace bytedance::bolt::row {
+
+class DenseRow {
+ public:
+  explicit DenseRow(const RowVectorPtr& vector); // sizes every row up front
+  DenseRow(DenseRow&&) noexcept;
+  DenseRow& operator=(DenseRow&&) noexcept;
+  ~DenseRow();
+
+  vector_size_t numRows() const; // rows in the input vector
+
+  size_t rowSizeAt(vector_size_t index) const; // rowSizes()[index], unchecked
+
+  // Per-row encoded byte counts (all rows). DenseRow precomputes these in its
+  // size pass, so this bulk accessor is free; rowSizeAt() indexes into it.
+  const std::vector<size_t>& rowSizes() const;
+
+  // Sum of rowSizes(), precomputed by the constructor.
+  size_t totalSize() const;
+
+  // Serialize every row into `base + offsets[r]`. `offsets.size()` must equal
+  // numRows(); row r writes exactly rowSizes()[r] bytes.
+  void serialize(uint8_t* base, folly::Range<const size_t*> offsets) const;
+
+  // Inverse of serialize(): rebuild a RowVector of `rowType` from pre-split
+  // per-row byte ranges (one per row); a row with leftover bytes is rejected.
+  static RowVectorPtr deserialize(
+      const std::vector<std::string_view>& data,
+      const RowTypePtr& rowType,
+      memory::MemoryPool* pool);
+
+ private:
+  struct State; // defined in DenseRow.cpp
+  std::unique_ptr<State> state_;
+};
+
+} // namespace bytedance::bolt::row
diff --git a/bolt/row/dense/DenseRowGeneral.h b/bolt/row/dense/DenseRowGeneral.h
new file mode 100644
index 000000000..3ae37d78d
--- /dev/null
+++ b/bolt/row/dense/DenseRowGeneral.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include <folly/CPortability.h>
+#include <folly/Range.h>
+#include <folly/small_vector.h>
+
+#include "bolt/common/base/Nulls.h"
+#include "bolt/row/dense/IntVarint.h"
+#include "bolt/vector/BaseVector.h"
+#include "bolt/vector/DecodedVector.h"
+
+// Internal declarations for the GENERAL (non-flat) dense-row codec: the
+// column-batch machinery that handles arbitrary nesting (ARRAY/MAP/ROW) and
+// dictionary/constant inputs. Shared between its two implementation TUs —
+// DenseRowGeneralEncode.cpp and DenseRowGeneralDecode.cpp — and included by the
+// DenseRow public API layer (DenseRow.cpp) and the scalar fast path
+// (DenseRowScalar*.cpp) for the common varint/slot helpers below.
+//
+// Encode and decode live in separate TUs so their (cache-alignment-sensitive)
+// machine-code layout is not perturbed by unrelated edits to the other side —
+// see the layout-sensitivity note at the top of DenseRowGeneralDecode.cpp.
+namespace bytedance::bolt::row::dense_row {
+
+// The BMI2 fast path is selected at compile time inside IntVarint.h (gated by
+// the x86_64 `#if`), so these are just the detail helpers under this namespace.
+using detail::readNullableInt128;
+using detail::readNullableInt64;
+using detail::readVarint;
+using detail::writeNullableInt128;
+using detail::writeNullableInt64;
+using detail::writeVarint;
+
+// Null sentinels for REAL/DOUBLE: a non-null value whose raw bits collide with
+// the sentinel is bit-flipped on encode and restored on decode.
+constexpr uint32_t kNullFloatBits{0x7fc00000U};
+constexpr uint64_t kNullDoubleBits{0x7ff8000000000000ULL};
+
+// =============================================================================
+// Shared slot machinery for column-batch encode/decode.
+// =============================================================================
+//
+// Each call processes N source rows. At every recursion level, each source row
+// contributes zero or more contiguous slot ranges in the level's vector.
+// SlotView.slots is a flat array of (base, count) ranges; the entries for
+// source row r occupy slots[rowBoundaries[r]..rowBoundaries[r+1]).
+//
+// POSITION SPACE: slot positions index the CURRENT level's vector — for a
+// nested ARRAY/MAP level that is the child elements vector's own position
+// space (built from the parent's rawOffsets/rawSizes), NOT the parent's. The
+// top-level SlotView indexes the top vector ({r, 1} per row). `parentNulls`, if
+// set, is indexed by these same current-level positions. A leaf encoder maps a
+// position `p` to the decoded value via `plan.decoded` (identity, or
+// `decoded.index(p)` for dictionary/constant inputs) — so `p` is the decoded
+// vector's position, and rawOffsets/rawSizes for ARRAY/MAP are read at the
+// decoded index.
+//
+// Multiple ranges per row are necessary because ArrayVector/MapVector input can
+// have non-contiguous child layouts (gaps between adjacent parent slots), so
+// each non-null parent slot contributes its own child range. Decoded output
+// vectors are always packed contiguously, so on decode each source row's child
+// ranges may happen to be back-to-back, but the representation stays uniform.
+struct SlotRange {
+  uint32_t base; // first position of the range
+  uint32_t count; // number of consecutive positions starting at base
+};
+
+struct SlotView {
+  folly::Range<const SlotRange*> slots; // non-owning; all rows' ranges, flat
+  folly::Range<const uint32_t*> rowBoundaries; // size N+1
+  // Per-position null bitmap inherited from ancestor ROWs. Indexed by the
+  // current level's vector positions. nullptr means no filter.
+  const uint64_t* parentNulls = nullptr;
+};
+
+struct RowCursor {
+  const uint8_t* cur; // next unread byte of the row
+  const uint8_t* end; // one past the row's last byte
+};
+
+// Invokes f(p) for every live position p of source row r: walks the row's slot
+// ranges slots[rowBoundaries[r], rowBoundaries[r + 1]) in order and, when
+// parentNulls is set, skips positions whose bit reads as null.
+template <typename F>
+FOLLY_ALWAYS_INLINE void forEachLivePos(SlotView v, vector_size_t r, F f) {
+  const uint64_t* nulls = v.parentNulls;
+  const auto* slots = v.slots.data();
+  const auto lo = v.rowBoundaries[r];
+  const auto hi = v.rowBoundaries[r + 1];
+  if (!nulls) { // no filter: the null test is hoisted out of both loops
+    for (uint32_t i = lo; i < hi; ++i) {
+      const auto& sr = slots[i];
+      const uint32_t end = sr.base + sr.count;
+      for (uint32_t p = sr.base; p < end; ++p) {
+        f(p);
+      }
+    }
+  } else {
+    for (uint32_t i = lo; i < hi; ++i) {
+      const auto& sr = slots[i];
+      const uint32_t end = sr.base + sr.count;
+      for (uint32_t p = sr.base; p < end; ++p) {
+        if (bits::isBitNull(nulls, static_cast<int32_t>(p))) {
+          continue; // position filtered out by parentNulls
+        }
+        f(p);
+      }
+    }
+  }
+}
+
+// Owning storage for the top-level SlotView: one {r, 1} slot per source row,
+// covering positions [0, rowCount). Used by serialize and deserialize.
+struct TopSlotView {
+  std::vector<SlotRange> slots; // backing storage for view().slots
+  std::vector<uint32_t> boundaries; // backing storage for view().rowBoundaries
+
+  // Non-owning view over the two vectors; const because it only reads them.
+  SlotView view() const {
+    return SlotView{
+        {slots.data(), slots.size()},
+        {boundaries.data(), boundaries.size()},
+        nullptr};
+  }
+};
+
+// TODO delete TopSlotView
+inline TopSlotView makeTopView(vector_size_t rowCount) {
+  TopSlotView top;
+  top.slots.resize(rowCount);
+  top.boundaries.resize(rowCount + 1);
+  for (uint32_t row = 0; row < static_cast<uint32_t>(rowCount); ++row) {
+    top.slots[row] = SlotRange{row, 1u}; // row r owns exactly position r
+    top.boundaries[row] = row;
+  }
+  top.boundaries.back() = static_cast<uint32_t>(rowCount); // closing boundary
+  return top;
+}
+
+// =============================================================================
+// ENCODE — column-batch encode kernels (DenseRowGeneralEncode.cpp).
+// =============================================================================
+
+// Backing storage for one nested ARRAY/MAP level's child SlotView: the child
+// slot ranges + per-source-row boundaries. Built by the SizeSink pass and read
+// back by the WriteSink pass (see ColumnPlan::childSlots).
+struct SlotTreeNode {
+  folly::small_vector<SlotRange, 32> slots; // child ranges, all rows, flat
+  folly::small_vector<uint32_t, 16> boundaries; // per-row offsets into slots
+};
+
+// One node of the per-row encode plan: a column at one nesting level. Caches
+// that column's DecodedVector (so reads see through dictionary/constant
+// wrapping) and, for nested types, its child plans. A flat tagged struct (no
+// inheritance): `kind` selects how `children` is interpreted —
+//   ARRAY -> {elements}, MAP -> {keys, values}, ROW -> fields, scalar -> {}.
+// `buildPlan` produces the tree once; both the size and write passes reuse it.
+// The concrete ArrayVector/MapVector/RowVector base is recovered at the use
+// site via decoded.base()->as<...>().
+struct ColumnPlan {
+  TypeKind kind{TypeKind::UNKNOWN}; // selects the meaning of `children`
+  DecodedVector decoded; // decode of `source`
+  bool mayHaveNulls{false}; // NOTE(review): set by buildPlan (not shown here)
+  bool isNullColumn{false}; // NOTE(review): presumably an all-null column
+  std::vector<ColumnPlan> children;
+  // The vector this node's `decoded` reads, held so the node is self-contained:
+  // both `decoded` and the buffers it points into stay valid for the node's
+  // lifetime, independent of who else references the input. Null only for an
+  // all-null (missing) ROW child.
+  VectorPtr source;
+  // ARRAY/MAP only: the child SlotView's storage, built into the plan tree by
+  // the SizeSink pass and replayed by the WriteSink pass — so each level reads
+  // its own slots straight off the tree (no shared cursor/scratch between
+  // passes). `mutable` because the (size) pass fills it through a `const`
+  // ColumnPlan&; it is a derived cache, not part of the plan's identity.
+  mutable SlotTreeNode childSlots;
+};
+
+ColumnPlan buildPlan(const TypePtr& type, const VectorPtr& vector); // see above
+
+// A "sink" abstracts the size pass vs the write pass: encodeColumnBatch is
+// templated on it so both passes share one implementation (byte counts and
+// bytes-written cannot drift). SizeSink accumulates a byte count; WriteSink
+// writes bytes through a moving cursor.
+//
+// The size pass does not naively walk every value: fixed-width leaves
+// (BOOLEAN/REAL/DOUBLE) add count*width analytically, and integer leaves use
+// the SIMD-batched sumNullableIntSizes; only variable-length leaves (VARCHAR)
+// are walked per value, which is unavoidable. The SizeSink pass also builds the
+// slot tree that the write pass reuses for nested ARRAY/MAP, so it is not just
+// a size computation that could be replaced by a closed-form estimate.
+struct SizeSink {
+  size_t bytes{0}; // running byte count for one row
+
+  FOLLY_ALWAYS_INLINE void putVarint(uint64_t v) {
+    bytes += detail::varintSize(v);
+  }
+  FOLLY_ALWAYS_INLINE void putNullableInt64(int64_t v, bool isNull) {
+    bytes += detail::nullableInt64SerializedSize(v, isNull);
+  }
+  FOLLY_ALWAYS_INLINE void putRaw(const void* /*p*/, size_t n) {
+    bytes += n; // payload is never read, only counted
+  }
+  template <typename T>
+  FOLLY_ALWAYS_INLINE void putFixed(const T& /*v*/) {
+    bytes += sizeof(T); // fixed width, independent of the value
+  }
+};
+
+struct WriteSink {
+  uint8_t* out{nullptr}; // write cursor; every put* advances it, unchecked
+
+  FOLLY_ALWAYS_INLINE void putVarint(uint64_t v) {
+    out = writeVarint(v, out);
+  }
+  FOLLY_ALWAYS_INLINE void putNullableInt64(int64_t v, bool isNull) {
+    out = writeNullableInt64(v, isNull, out);
+  }
+  FOLLY_ALWAYS_INLINE void putRaw(const void* p, size_t n) {
+    std::memcpy(out, p, n); // copies n bytes verbatim
+    out += n;
+  }
+  template <typename T>
+  FOLLY_ALWAYS_INLINE void putFixed(const T& v) {
+    std::memcpy(out, &v, sizeof(T)); // object bytes as stored in memory
+    out += sizeof(T);
+  }
+};
+
+// Encode one column (any type) for N source rows into the per-row sinks. The
+// SizeSink pass fills each ARRAY/MAP node's ColumnPlan::childSlots and the
+// WriteSink pass reads them back, so the SizeSink pass must run first.
+// Instantiated for SizeSink and WriteSink in DenseRowGeneralEncode.cpp.
+template <typename Sink>
+void encodeColumnBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls);
+
+// Encode a ROW level directly (entry point for the marker-less serializer).
+// emitMarker=false omits the per-position present/null marker; the caller
+// then asserts there are no nulls at this level.
+template <typename Sink>
+void encodeRowBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls,
+    bool emitMarker = true);
+
+// =============================================================================
+// DECODE — column-batch decode kernels (DenseRowGeneralDecode.cpp).
+// =============================================================================
+
+// Decode entry points; the two recurse into each other via the type dispatch.
+// For decodeRowBatch, `readMarker == false` is the marker-less shuffle
+// contract (caller asserts every top-level row is non-null).
+void decodeColumnBatch(
+    const Type& type,
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls);
+
+void decodeRowBatch(
+    const Type& type,
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls,
+    bool readMarker = true);
+
+} // namespace bytedance::bolt::row::dense_row
diff --git a/bolt/row/dense/DenseRowGeneralDecode.cpp b/bolt/row/dense/DenseRowGeneralDecode.cpp
new file mode 100644
index 000000000..24bec611d
--- /dev/null
+++ b/bolt/row/dense/DenseRowGeneralDecode.cpp
@@ -0,0 +1,638 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Decode (deserialize) kernels for the level-hoisted dense row format.
+//
+// These nested-container decode loops are pathologically sensitive to machine-
+// code layout: on byte-identical source, small shifts in surrounding code swing
+// individual cases (nested ARRAY/MAP/long-string deserialize) by ~10-14% purely
+// from cache-line / branch-predictor aliasing — see the intra-TU layout note in
+// DenseRow.cpp. They live in their own translation unit so that unrelated
+// edits to the encode/serializer code no longer re-roll that layout lottery;
+// the decode layout is now determined solely by this file. Do not merge these
+// kernels back into another TU, and re-run dense_row_serialize_benchmark
+// (dense_deserialize_*) after any change here.
+
+#include "bolt/row/dense/DenseRowGeneral.h"
+
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include <folly/small_vector.h>
+
+#include "bolt/vector/ComplexVector.h"
+#include "bolt/vector/FlatVector.h"
+
+namespace bytedance::bolt::row::dense_row {
+
+// =============================================================================
+// Decode side
+// =============================================================================
+
+// Decodes nullable-int64 wire values into a FlatVector<T>, one cursor per row.
+// Rows null in rowNulls are skipped; values outside T's range fail the check.
+template <typename T>
+void decodeIntegerBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* vec = dst.asUnchecked<FlatVector<T>>();
+  auto* values = vec->mutableRawValues();
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      bool isNull{false};
+      int64_t v{0};
+      BOLT_USER_CHECK(
+          readNullableInt64(c.cur, c.end, isNull, v),
+          "DenseRow: malformed integer value at row {}",
+          r);
+      if (!isNull) {
+        if constexpr (!std::is_same_v<T, int64_t>) {
+          BOLT_USER_CHECK(
+              v >= static_cast<int64_t>(std::numeric_limits<T>::min()) &&
+                  v <= static_cast<int64_t>(std::numeric_limits<T>::max()),
+              "DenseRow: integer value out of range at row {}: {}",
+              r,
+              v);
+        }
+        values[p] = static_cast<T>(v);
+      }
+      vec->setNull(static_cast<vector_size_t>(p), isNull);
+    });
+  }
+}
+
+// BOOLEAN decode: one varint per live position; 0 means null, 1 false, 2 true.
+// Anything above 2 is malformed. Rows flagged null in rowNulls are skipped.
+void decodeBooleanBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* bools = dst.asUnchecked<FlatVector<bool>>();
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      uint64_t v{0};
+      BOLT_USER_CHECK(
+          readVarint(c.cur, c.end, v),
+          "DenseRow: malformed boolean at row {}",
+          r);
+      BOLT_USER_CHECK(
+          v <= 2, "DenseRow: invalid boolean encoding at row {}: {}", r, v);
+      bools->setNull(static_cast<vector_size_t>(p), v == 0);
+      if (v != 0) {
+        bools->set(static_cast<vector_size_t>(p), v == 2);
+      }
+    });
+  }
+}
+
+// REAL decode: 4 raw bytes per position; kNullFloatBits marks a null.
+void decodeRealBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* floats = dst.asUnchecked<FlatVector<float>>();
+  auto* values = floats->mutableRawValues();
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      BOLT_USER_CHECK(
+          static_cast<size_t>(c.end - c.cur) >= sizeof(uint32_t),
+          "DenseRow: truncated real at row {}",
+          r);
+      uint32_t b;
+      std::memcpy(&b, c.cur, sizeof(b));
+      c.cur += sizeof(b);
+      const bool isNull = b == kNullFloatBits;
+      floats->setNull(static_cast<vector_size_t>(p), isNull);
+      if (!isNull) {
+        if (FOLLY_UNLIKELY(b == (kNullFloatBits ^ 1u))) {
+          b ^= 1u;
+        }
+        std::memcpy(values + p, &b, sizeof(b));
+      }
+    });
+  }
+}
+
+// DOUBLE decode: 8 raw bytes per position; kNullDoubleBits marks a null.
+void decodeDoubleBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* doubles = dst.asUnchecked<FlatVector<double>>();
+  auto* values = doubles->mutableRawValues();
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      BOLT_USER_CHECK(
+          static_cast<size_t>(c.end - c.cur) >= sizeof(uint64_t),
+          "DenseRow: truncated double at row {}",
+          r);
+      uint64_t b;
+      std::memcpy(&b, c.cur, sizeof(b));
+      c.cur += sizeof(b);
+      const bool isNull = b == kNullDoubleBits;
+      doubles->setNull(static_cast<vector_size_t>(p), isNull);
+      if (!isNull) {
+        if (FOLLY_UNLIKELY(b == (kNullDoubleBits ^ 1ull))) {
+          b ^= 1ull;
+        }
+        std::memcpy(values + p, &b, sizeof(b));
+      }
+    });
+  }
+}
+
+// Inverse of encodeHugeintBatch. Wire form per position: nullableInt64 holding
+// the low 64 bits of the zigzag128 value plus the null marker, then (non-null
+// only) varint of the high 64 bits. See detail::writeNullableInt128.
+void decodeHugeintBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* wide = dst.asUnchecked<FlatVector<int128_t>>();
+  auto* values = wide->mutableRawValues();
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      bool isNull{false};
+      int128_t v{0};
+      BOLT_USER_CHECK(
+          readNullableInt128(c.cur, c.end, isNull, v),
+          "DenseRow: malformed hugeint at row {}",
+          r);
+      wide->setNull(static_cast<vector_size_t>(p), isNull);
+      if (!isNull) {
+        values[p] = v;
+      }
+    });
+  }
+}
+
+// Decodes VARCHAR/VARBINARY: per row, a varint length stream (0 = null, else
+// length + 1) followed by the payload bytes of the non-null positions.
+void decodeVarcharBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* flat = dst.asUnchecked<FlatVector<StringView>>();
+  // Raw-pointer writes avoid flat->set()'s extra memcpy for non-inline strings.
+  auto* rawValues = flat->mutableRawValues();
+  char* buf{nullptr};
+  size_t bufRemaining = 0;
+  const auto N = static_cast<vector_size_t>(cursors.size());
+  // Encoded lengths above this would be truncated by the int32_t stash below.
+  constexpr uint64_t kMaxLen = std::numeric_limits<int32_t>::max();
+  // Pass 1 stashes each position's length (-1 = null); pass 2 places payloads.
+  folly::small_vector<int32_t, 32> lengths;
+  for (vector_size_t r = 0; r < N; ++r) {
+    if (rowNulls && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    lengths.clear();
+    forEachLivePos(out, r, [&](uint32_t /*p*/) {
+      uint64_t v{0};
+      BOLT_USER_CHECK(
+          readVarint(c.cur, c.end, v),
+          "DenseRow: malformed varchar length at row {}",
+          r);
+      BOLT_USER_CHECK_LE(
+          v, kMaxLen, "DenseRow: varchar length too large at row {}", r);
+      lengths.push_back(v == 0 ? -1 : static_cast<int32_t>(v - 1));
+    });
+    size_t idx = 0;
+    forEachLivePos(out, r, [&](uint32_t p) {
+      const int32_t len = lengths[idx++];
+      if (len < 0) {
+        flat->setNull(static_cast<vector_size_t>(p), true);
+        return;
+      }
+      const auto ulen = static_cast<size_t>(len);
+      BOLT_USER_CHECK(
+          static_cast<size_t>(c.end - c.cur) >= ulen,
+          "DenseRow: truncated varchar payload at row {}",
+          r);
+      flat->setNull(static_cast<vector_size_t>(p), false);
+      if (ulen <= StringView::kInlineSize) {
+        rawValues[p] = StringView(reinterpret_cast<const char*>(c.cur), ulen);
+      } else {
+        if (bufRemaining < ulen) {
+          // Upper bound: rest of this row plus every later non-null row.
+          size_t needed = static_cast<size_t>(c.end - c.cur);
+          for (vector_size_t j = r + 1; j < N; ++j) {
+            if (!rowNulls || !bits::isBitNull(rowNulls, j)) {
+              needed += static_cast<size_t>(cursors[j].end - cursors[j].cur);
+            }
+          }
+          buf = flat->getRawStringBufferWithSpace(needed, true);
+          bufRemaining = needed;
+        }
+        std::memcpy(buf, c.cur, ulen);
+        rawValues[p] = StringView(buf, ulen);
+        buf += ulen;
+        bufRemaining -= ulen;
+      }
+      c.cur += ulen;
+    });
+  }
+}
+
+// TIMESTAMP decode: each position is a nullable int64 read from the row's
+// cursor and converted with Timestamp::fromMicrosNoError. Rows flagged null in
+// rowNulls are skipped.
+void decodeTimestampBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* stamps = dst.asUnchecked<FlatVector<Timestamp>>();
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      bool isNull{false};
+      int64_t micros{0};
+      BOLT_USER_CHECK(
+          readNullableInt64(c.cur, c.end, isNull, micros),
+          "DenseRow: malformed timestamp at row {}",
+          r);
+      const auto pos = static_cast<vector_size_t>(p);
+      stamps->setNull(pos, isNull);
+      if (!isNull) {
+        stamps->set(pos, Timestamp::fromMicrosNoError(micros));
+      }
+    });
+  }
+}
+
+void decodeNullColumnBatch(
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  const auto numRows = static_cast<vector_size_t>(cursors.size());
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      uint64_t v{0};
+      BOLT_USER_CHECK(
+          readVarint(c.cur, c.end, v),
+          "DenseRow: malformed unknown-type marker at row {}",
+          r);
+      BOLT_USER_CHECK(
+          v == 0, "DenseRow: unknown-type expected null marker at row {}", r);
+      dst.setNull(static_cast<vector_size_t>(p), true);
+    });
+  }
+}
+
+// Pass 1 for ARRAY/MAP decode: reads one cardinality varint per parent position
+// (0 = null, else size + 1) and reports (pos, isNull, off, sz) via `assign`, a
+// template parameter so the call inlines. Non-empty parents also append a child
+// SlotRange; childBoundaries[r]..[r + 1] delimits row r's ranges in childSlots.
+template <typename Assign>
+void decodeArrayLikePass1(
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls,
+    vector_size_t childBase,
+    folly::small_vector<SlotRange, 32>& childSlots,
+    folly::small_vector<uint32_t, 16>& childBoundaries,
+    vector_size_t& totalChildren,
+    const char* what,
+    Assign assign) {
+  constexpr vector_size_t kMaxSize = std::numeric_limits<vector_size_t>::max();
+  const auto N = static_cast<vector_size_t>(cursors.size());
+  childBoundaries.resize(N + 1);
+  childBoundaries[0] = 0;
+  vector_size_t writeHead = childBase;
+  for (vector_size_t r = 0; r < N; ++r) {
+    if (rowNulls && bits::isBitNull(rowNulls, r)) {
+      childBoundaries[r + 1] = static_cast<uint32_t>(childSlots.size());
+      continue;
+    }
+    RowCursor& c = cursors[r];
+    forEachLivePos(out, r, [&](uint32_t p) {
+      uint64_t e{0};
+      BOLT_USER_CHECK(
+          readVarint(c.cur, c.end, e),
+          "DenseRow: malformed {} cardinality at row {}",
+          what,
+          r);
+      if (e == 0) {
+        assign(static_cast<vector_size_t>(p), /*isNull=*/true, 0, 0);
+      } else {
+        // Each element takes >= 1 byte of this row, so more is corrupt input;
+        // the second check keeps writeHead from overflowing vector_size_t.
+        const uint64_t card = e - 1;
+        BOLT_USER_CHECK_LE(
+            card,
+            static_cast<uint64_t>(c.end - c.cur),
+            "DenseRow: {} cardinality {} exceeds remaining bytes at row {}",
+            what,
+            card,
+            r);
+        BOLT_USER_CHECK_LE(
+            card,
+            static_cast<uint64_t>(kMaxSize - writeHead),
+            "DenseRow: {} child count overflows vector size at row {}",
+            what,
+            r);
+        const auto sz = static_cast<vector_size_t>(card);
+        assign(static_cast<vector_size_t>(p), /*isNull=*/false, writeHead, sz);
+        if (sz > 0) {
+          childSlots.push_back(
+              {static_cast<uint32_t>(writeHead), static_cast<uint32_t>(sz)});
+        }
+        writeHead += sz;
+      }
+    });
+    childBoundaries[r + 1] = static_cast<uint32_t>(childSlots.size());
+  }
+  totalChildren = writeHead - childBase;
+}
+
+// ARRAY decode: pass 1 sets each array's offset/size, pass 2 decodes elements.
+void decodeArrayBatch(
+    const Type& type,
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* array = dst.asUnchecked<ArrayVector>();
+  auto& elements = *array->elements();
+  const vector_size_t childBase = elements.size();
+  folly::small_vector<SlotRange, 32> childSlots;
+  folly::small_vector<uint32_t, 16> childBoundaries;
+  vector_size_t totalChildren = 0;
+  decodeArrayLikePass1(
+      out,
+      cursors,
+      rowNulls,
+      childBase,
+      childSlots,
+      childBoundaries,
+      totalChildren,
+      "array",
+      [&](vector_size_t pos, bool isNull, vector_size_t off, vector_size_t sz) {
+        array->setNull(pos, isNull);
+        array->setOffsetAndSize(pos, off, sz);
+      });
+  // New elements are appended after whatever the child vector already held.
+  elements.resize(childBase + totalChildren);
+  // Children use their own slot ranges; no parentNulls are passed down.
+  SlotView childView{
+      {childSlots.data(), childSlots.size()},
+      {childBoundaries.data(), childBoundaries.size()},
+      nullptr};
+  decodeColumnBatch(*type.childAt(0), elements, childView, cursors, rowNulls);
+}
+
+// MAP decode: pass 1 sets each map's offset/size, then keys and values decode.
+void decodeMapBatch(
+    const Type& type,
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  auto* map = dst.asUnchecked<MapVector>();
+  auto& keys = *map->mapKeys();
+  auto& values = *map->mapValues();
+  const vector_size_t childBase = keys.size();
+  folly::small_vector<SlotRange, 32> childSlots;
+  folly::small_vector<uint32_t, 16> childBoundaries;
+  vector_size_t totalChildren = 0;
+  decodeArrayLikePass1(
+      out,
+      cursors,
+      rowNulls,
+      childBase,
+      childSlots,
+      childBoundaries,
+      totalChildren,
+      "map",
+      [&](vector_size_t pos, bool isNull, vector_size_t off, vector_size_t sz) {
+        map->setNull(pos, isNull);
+        map->setOffsetAndSize(pos, off, sz);
+      });
+  // Keys and values grow in lockstep; both are offset from keys.size().
+  keys.resize(childBase + totalChildren);
+  values.resize(childBase + totalChildren);
+  // One slot view drives both children; no parentNulls are passed down.
+  SlotView childView{
+      {childSlots.data(), childSlots.size()},
+      {childBoundaries.data(), childBoundaries.size()},
+      nullptr};
+  decodeColumnBatch(*type.childAt(0), keys, childView, cursors, rowNulls);
+  decodeColumnBatch(*type.childAt(1), values, childView, cursors, rowNulls);
+}
+
+// ROW decode: settles each position's null state (read from the wire marker
+// when readMarker is true), then decodes every field through the same slots.
+void decodeRowBatch(
+    const Type& type,
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls,
+    bool readMarker) {
+  auto* row = dst.asUnchecked<RowVector>();
+  const auto N = static_cast<vector_size_t>(cursors.size());
+  vector_size_t bitmapBits = 0;
+  for (const auto& sr : out.slots) {
+    const auto endPos = static_cast<vector_size_t>(sr.base + sr.count);
+    if (endPos > bitmapBits) {
+      bitmapBits = endPos;
+    }
+  }
+  // Child parentNulls: all bits set at first; bits::setNull marks the nulls.
+  std::vector<uint64_t> childNullsBuf;
+  const uint64_t* childParentNulls = out.parentNulls;
+  if (bitmapBits > 0) {
+    childNullsBuf.assign(bits::nwords(bitmapBits), ~uint64_t{0});
+    childParentNulls = childNullsBuf.data();
+  }
+  if (readMarker) {
+    for (vector_size_t r = 0; r < N; ++r) {
+      if (rowNulls && bits::isBitNull(rowNulls, r)) {
+        continue;
+      }
+      RowCursor& c = cursors[r];
+      const auto lo = out.rowBoundaries[r];
+      const auto hi = out.rowBoundaries[r + 1];
+      for (uint32_t i = lo; i < hi; ++i) {
+        const auto& sr = out.slots[i];
+        const uint32_t endPos = sr.base + sr.count;
+        for (uint32_t p = sr.base; p < endPos; ++p) {
+          const bool parentSaysNull = out.parentNulls &&
+              bits::isBitNull(out.parentNulls, static_cast<int32_t>(p));
+          if (parentSaysNull) {
+            if (!childNullsBuf.empty()) {
+              bits::setNull(childNullsBuf.data(), p, true);
+            }
+            continue;
+          }
+          uint64_t v{0};
+          BOLT_USER_CHECK(
+              readVarint(c.cur, c.end, v),
+              "DenseRow: malformed row null marker at row {}",
+              r);
+          if (v == 0) {
+            row->setNull(static_cast<vector_size_t>(p), true);
+            if (!childNullsBuf.empty()) {
+              bits::setNull(childNullsBuf.data(), p, true);
+            }
+          } else {
+            BOLT_USER_CHECK(
+                v == 1,
+                "DenseRow: invalid row null marker at row {}: {}",
+                r,
+                v);
+            row->setNull(static_cast<vector_size_t>(p), false);
+          }
+        }
+      }
+    }
+  } else {
+    // No marker on wire — caller asserts every position is non-null.
+    // Mirror the null-tracking the marker pass would have done for a
+    // non-null row so descended children see consistent parentNulls.
+    for (vector_size_t r = 0; r < N; ++r) {
+      if (rowNulls && bits::isBitNull(rowNulls, r)) {
+        continue;
+      }
+      const auto lo = out.rowBoundaries[r];
+      const auto hi = out.rowBoundaries[r + 1];
+      for (uint32_t i = lo; i < hi; ++i) {
+        const auto& sr = out.slots[i];
+        const uint32_t endPos = sr.base + sr.count;
+        for (uint32_t p = sr.base; p < endPos; ++p) {
+          const bool parentSaysNull = out.parentNulls &&
+              bits::isBitNull(out.parentNulls, static_cast<int32_t>(p));
+          if (parentSaysNull) {
+            if (!childNullsBuf.empty()) {
+              bits::setNull(childNullsBuf.data(), p, true);
+            }
+            continue;
+          }
+          row->setNull(static_cast<vector_size_t>(p), false);
+        }
+      }
+    }
+  }
+
+  SlotView childView{out.slots, out.rowBoundaries, childParentNulls};
+  const auto fieldCount = type.size();
+  for (size_t f = 0; f < fieldCount; ++f) {
+    decodeColumnBatch(
+        *type.childAt(f), *row->childAt(f), childView, cursors, rowNulls);
+  }
+}
+
+void decodeColumnBatch(
+    const Type& type,
+    BaseVector& dst,
+    SlotView out,
+    folly::Range<RowCursor*> cursors,
+    const uint64_t* rowNulls) {
+  switch (type.kind()) {
+    case TypeKind::BOOLEAN:
+      decodeBooleanBatch(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::TINYINT:
+      decodeIntegerBatch<int8_t>(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::SMALLINT:
+      decodeIntegerBatch<int16_t>(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::INTEGER:
+      decodeIntegerBatch<int32_t>(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::BIGINT:
+      decodeIntegerBatch<int64_t>(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::REAL:
+      decodeRealBatch(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::DOUBLE:
+      decodeDoubleBatch(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::HUGEINT:
+      decodeHugeintBatch(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY:
+      decodeVarcharBatch(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::TIMESTAMP:
+      decodeTimestampBatch(dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::ARRAY:
+      decodeArrayBatch(type, dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::MAP:
+      decodeMapBatch(type, dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::ROW:
+      decodeRowBatch(type, dst, out, cursors, rowNulls);
+      break;
+    case TypeKind::UNKNOWN:
+      decodeNullColumnBatch(dst, out, cursors, rowNulls);
+      break;
+    default:
+      BOLT_UNREACHABLE();
+  }
+}
+
+} // namespace bytedance::bolt::row::dense_row
diff --git a/bolt/row/dense/DenseRowGeneralEncode.cpp b/bolt/row/dense/DenseRowGeneralEncode.cpp
new file mode 100644
index 000000000..4c5f2c4a8
--- /dev/null
+++ b/bolt/row/dense/DenseRowGeneralEncode.cpp
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// General (non-flat) column-batch encode kernels for the dense row format. See
+// DenseRowGeneral.h. Kept in its own TU to isolate code layout from the decode
+// kernels and the flat path; the two Sink instantiations the serializer needs
+// are explicitly instantiated at the bottom.
+
+#include "bolt/row/dense/DenseRowGeneral.h"
+
+#include <cstring>
+#include <type_traits>
+#include <vector>
+
+#include "bolt/common/base/Nulls.h"
+#include "bolt/row/dense/IntVarint.h"
+#include "bolt/vector/ComplexVector.h"
+#include "bolt/vector/FlatVector.h"
+
+namespace bytedance::bolt::row::dense_row {
+
+using detail::nullableInt64SerializedSize;
+using detail::varintSize;
+
+template <typename Sink>
+void encodeColumnBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls);
+
+// Recursively decodes `vector` (and every nested child) into a ColumnPlan tree.
+// A node stores the vector it decoded in `source`, so its `decoded` view (which
+// does not own anything) and the data it reads stay alive together, no matter
+// who else references the input. `children` holds the sub-plans: ARRAY ->
+// {elements}, MAP -> {keys, values}, ROW -> one per field. Scalars are leaves.
+// `type` supplies the kind and the child types; for ARRAY/MAP/ROW the decoded
+// base vector is checked to be the matching vector class.
+// NOLINTNEXTLINE(misc-no-recursion)
+ColumnPlan buildPlan(const TypePtr& type, const VectorPtr& vector) {
+  ColumnPlan plan;
+  plan.source = vector;
+  plan.kind = type->kind();
+  plan.decoded.decode(*vector);
+  plan.mayHaveNulls = plan.decoded.mayHaveNulls();
+
+  if (plan.kind == TypeKind::ARRAY) {
+    const auto* array = plan.decoded.base()->as<ArrayVector>();
+    BOLT_CHECK_NOT_NULL(array, "buildPlan: ARRAY base is not ArrayVector");
+    plan.children.push_back(buildPlan(type->childAt(0), array->elements()));
+    return plan;
+  }
+
+  if (plan.kind == TypeKind::MAP) {
+    const auto* map = plan.decoded.base()->as<MapVector>();
+    BOLT_CHECK_NOT_NULL(map, "buildPlan: MAP base is not MapVector");
+    plan.children.push_back(buildPlan(type->childAt(0), map->mapKeys()));
+    plan.children.push_back(buildPlan(type->childAt(1), map->mapValues()));
+    return plan;
+  }
+
+  if (plan.kind != TypeKind::ROW) {
+    // Scalar leaves: nothing more to build.
+    return plan;
+  }
+
+  const auto* row = plan.decoded.base()->as<RowVector>();
+  BOLT_CHECK_NOT_NULL(row, "buildPlan: ROW base is not RowVector");
+  const auto& rowType = type->asRow();
+  const auto fieldCount = rowType.size();
+  plan.children.reserve(fieldCount);
+
+  // When the ROW is dictionary/constant-wrapped, the outer mapping is pushed
+  // down onto every base child so the child's DecodedVector reads through the
+  // wrap; the wrapped child becomes that child node's `source`. A single-level
+  // dictionary reuses the input's own wrapInfo() buffer (no copy); constant and
+  // multi-level wraps have no standalone index buffer, so the resolved indices
+  // are materialized once.
+  BufferPtr outerIndices;
+  const auto outerSize = vector->size();
+  if (!plan.decoded.isIdentityMapping()) {
+    const bool reusesWrapInfo =
+        vector->encoding() == VectorEncoding::Simple::DICTIONARY &&
+        vector->wrapInfo()->as<vector_size_t>() == plan.decoded.indices();
+    if (reusesWrapInfo) {
+      outerIndices = vector->wrapInfo();
+    } else {
+      outerIndices =
+          AlignedBuffer::allocate<vector_size_t>(outerSize, vector->pool());
+      std::memcpy(
+          outerIndices->asMutable<vector_size_t>(),
+          plan.decoded.indices(),
+          outerSize * sizeof(vector_size_t));
+    }
+  }
+
+  for (size_t i = 0; i < fieldCount; ++i) {
+    const auto& baseChild = row->childAt(i);
+    if (!baseChild) {
+      // No child vector: emit a placeholder node marked isNullColumn.
+      ColumnPlan nullPlan;
+      nullPlan.kind = TypeKind::UNKNOWN;
+      nullPlan.isNullColumn = true;
+      plan.children.push_back(std::move(nullPlan));
+      continue;
+    }
+    // The wrap carries no nulls of its own, only the outer index mapping.
+    const VectorPtr child = outerIndices
+        ? BaseVector::wrapInDictionary(
+              /*nulls=*/BufferPtr{nullptr}, outerIndices, outerSize, baseChild)
+        : baseChild;
+    plan.children.push_back(buildPlan(rowType.childAt(i), child));
+  }
+  return plan;
+}
+
+// =============================================================================
+// Encode side
+// =============================================================================
+
+// Nullable integer encoder shared by the int8/int16/int32/int64 columns.
+//
+// Contiguous case (identity mapping, no nulls in the column, no parentNulls):
+// every slot range is a plain run of T, so the size pass sums it with
+// detail::sumNullableIntSizes and the write pass emits it in a tight loop.
+// Otherwise each live position is encoded on its own through plan.decoded.
+// Rows flagged null in rowNulls contribute nothing to their sink.
+template <typename Sink, typename T>
+void encodeIntegerBatchT(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  const auto numRows = static_cast<vector_size_t>(sinks.size());
+  const bool mayNulls = plan.mayHaveNulls;
+  const bool identity = plan.decoded.isIdentityMapping();
+  const auto* raw = plan.decoded.data<T>();
+  const bool contiguous = identity && !mayNulls && in.parentNulls == nullptr;
+
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    Sink& sink = sinks[r];
+    if (!contiguous) {
+      // General path: honour nulls and read through the decoded index mapping.
+      forEachLivePos(in, r, [&](uint32_t p) {
+        if (mayNulls && plan.decoded.isNullAt(p)) {
+          sink.putNullableInt64(int64_t{0}, true);
+          return;
+        }
+        const T value = identity ? raw[p] : raw[plan.decoded.index(p)];
+        sink.putNullableInt64(static_cast<int64_t>(value), false);
+      });
+      continue;
+    }
+
+    // Contiguous path: slot ranges index straight into the raw buffer.
+    const auto last = in.rowBoundaries[r + 1];
+    for (uint32_t i = in.rowBoundaries[r]; i < last; ++i) {
+      const auto& range = in.slots[i];
+      if constexpr (std::is_same_v<Sink, SizeSink>) {
+        // Size pass: sum the encoded sizes of the whole run at once.
+        sink.bytes +=
+            detail::sumNullableIntSizes<T>(raw + range.base, range.count);
+      } else {
+        const uint32_t end = range.base + range.count;
+        for (uint32_t p = range.base; p < end; ++p) {
+          sink.putNullableInt64(static_cast<int64_t>(raw[p]), false);
+        }
+      }
+    }
+  }
+}
+
+// BOOLEAN encoder. Every position takes exactly one byte on the wire: varint 0
+// for null, 1 for false, 2 for true. With no parentNulls the size pass
+// therefore only has to add up the slot-range counts of a row; the write
+// pass, and both passes when parentNulls is set, emit positions one at a
+// time. Rows flagged null in rowNulls contribute nothing to their sink.
+template <typename Sink>
+void encodeBooleanBatch(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  const auto numRows = static_cast<vector_size_t>(sinks.size());
+  const bool mayNulls = plan.mayHaveNulls;
+
+  // Writes the one-byte marker for position p into `sink`; the value is only
+  // read when the position is not null.
+  const auto emit = [&](Sink& sink, uint32_t p) {
+    if (mayNulls && plan.decoded.isNullAt(p)) {
+      sink.putVarint(0);
+      return;
+    }
+    sink.putVarint(plan.decoded.valueAt<bool>(p) ? 2 : 1);
+  };
+
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    Sink& sink = sinks[r];
+    if (in.parentNulls != nullptr) {
+      // parentNulls present: let forEachLivePos pick the positions to emit.
+      forEachLivePos(in, r, [&](uint32_t p) { emit(sink, p); });
+      continue;
+    }
+
+    const auto first = in.rowBoundaries[r];
+    const auto last = in.rowBoundaries[r + 1];
+    if constexpr (std::is_same_v<Sink, SizeSink>) {
+      // Size pass: one byte per position, whatever the values are.
+      size_t total = 0;
+      for (uint32_t i = first; i < last; ++i) {
+        total += in.slots[i].count;
+      }
+      sink.bytes += total;
+    } else {
+      // Write pass: one marker per position, in slot order.
+      for (uint32_t i = first; i < last; ++i) {
+        const auto& range = in.slots[i];
+        const uint32_t end = range.base + range.count;
+        for (uint32_t p = range.base; p < end; ++p) {
+          emit(sink, p);
+        }
+      }
+    }
+  }
+}
+
+// REAL encoder: every position is written as its 4-byte bit pattern, with
+// kNullFloatBits standing in for null. Contiguous case (identity mapping, no
+// nulls in the column, no parentNulls): the size pass is count * 4 per slot
+// range and the write pass copies straight out of the raw buffer. Otherwise
+// each live position is read through plan.decoded. Rows flagged null in
+// rowNulls contribute nothing to their sink.
+template <typename Sink>
+void encodeRealBatch(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  const auto numRows = static_cast<vector_size_t>(sinks.size());
+  const bool mayNulls = plan.mayHaveNulls;
+  const bool identity = plan.decoded.isIdentityMapping();
+  const auto* raw = plan.decoded.data<float>();
+  const bool contiguous = identity && !mayNulls && in.parentNulls == nullptr;
+
+  // Wire bits for the float at `src`. Matches the v1 collision policy: a value
+  // whose bits alias the null sentinel gets its lowest bit flipped. Inputs
+  // whose bits already equal kNullFloatBits ^ 1 therefore round-trip through
+  // a single-bit corruption, the same lossy behavior as the v1 wire format.
+  const auto wireBits = [](const float* src) {
+    uint32_t b;
+    std::memcpy(&b, src, sizeof(b));
+    if (FOLLY_UNLIKELY(b == kNullFloatBits)) {
+      b ^= 1u;
+    }
+    return b;
+  };
+
+  for (vector_size_t r = 0; r < numRows; ++r) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    Sink& sink = sinks[r];
+    if (!contiguous) {
+      // General path: nulls become the sentinel, values go through decoded.
+      forEachLivePos(in, r, [&](uint32_t p) {
+        if (mayNulls && plan.decoded.isNullAt(p)) {
+          sink.template putFixed<uint32_t>(kNullFloatBits);
+          return;
+        }
+        const float value = plan.decoded.valueAt<float>(p);
+        sink.template putFixed<uint32_t>(wireBits(&value));
+      });
+      continue;
+    }
+
+    // Contiguous path: slot ranges index straight into the raw buffer.
+    const auto last = in.rowBoundaries[r + 1];
+    for (uint32_t i = in.rowBoundaries[r]; i < last; ++i) {
+      const auto& range = in.slots[i];
+      if constexpr (std::is_same_v<Sink, SizeSink>) {
+        sink.bytes += static_cast<size_t>(range.count) * sizeof(uint32_t);
+      } else {
+        const uint32_t end = range.base + range.count;
+        for (uint32_t p = range.base; p < end; ++p) {
+          sink.template putFixed<uint32_t>(wireBits(raw + p));
+        }
+      }
+    }
+  }
+}
+
+template <typename Sink>
+void encodeDoubleBatch(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  // A non-null double whose bits alias kNullDoubleBits has its low bit
+  // flipped so it is never written as the null sentinel.
+  const auto toWire = [](uint64_t word) -> uint64_t {
+    return FOLLY_UNLIKELY(word == kNullDoubleBits) ? (word ^ 1ull) : word;
+  };
+  const auto rowCount = static_cast<vector_size_t>(sinks.size());
+  const bool nullable = plan.mayHaveNulls;
+  const auto* values = plan.decoded.data<double>();
+
+  // Flat, null-free input with no inherited null bitmap: no null probes.
+  if (plan.decoded.isIdentityMapping() && !nullable &&
+      in.parentNulls == nullptr) {
+    for (vector_size_t row = 0; row < rowCount; ++row) {
+      if (rowNulls != nullptr && bits::isBitNull(rowNulls, row)) {
+        continue;
+      }
+      Sink& sink = sinks[row];
+      const auto firstSlot = in.rowBoundaries[row];
+      const auto slotLimit = in.rowBoundaries[row + 1];
+      for (uint32_t slot = firstSlot; slot < slotLimit; ++slot) {
+        const auto& range = in.slots[slot];
+        if constexpr (std::is_same_v<Sink, SizeSink>) {
+          sink.bytes += static_cast<size_t>(range.count) * sizeof(uint64_t);
+        } else {
+          const uint32_t stop = range.base + range.count;
+          for (uint32_t pos = range.base; pos < stop; ++pos) {
+            uint64_t word;
+            std::memcpy(&word, values + pos, sizeof(word));
+            sink.template putFixed<uint64_t>(toWire(word));
+          }
+        }
+      }
+    }
+    return;
+  }
+
+  // General path: null-aware, one callback per position from forEachLivePos.
+  for (vector_size_t row = 0; row < rowCount; ++row) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, row)) {
+      continue;
+    }
+    Sink& sink = sinks[row];
+    forEachLivePos(in, row, [&](uint32_t pos) {
+      if (nullable && plan.decoded.isNullAt(pos)) {
+        sink.template putFixed<uint64_t>(kNullDoubleBits);
+        return;
+      }
+      const double value = plan.decoded.valueAt<double>(pos);
+      uint64_t word;
+      std::memcpy(&word, &value, sizeof(word));
+      sink.template putFixed<uint64_t>(toWire(word));
+    });
+  }
+}
+
+template <typename Sink>
+void encodeVarcharBatch(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  const auto isNullPos = [&plan](uint32_t pos) {
+    return plan.mayHaveNulls && plan.decoded.isNullAt(pos);
+  };
+  const auto rowCount = static_cast<vector_size_t>(sinks.size());
+  // Per row segment the length stream goes first, then the payload stream.
+  for (vector_size_t row = 0; row < rowCount; ++row) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, row)) {
+      continue;
+    }
+    Sink& sink = sinks[row];
+    forEachLivePos(in, row, [&](uint32_t pos) {
+      if (isNullPos(pos)) {
+        sink.putVarint(0);
+        return;
+      }
+      const auto length = plan.decoded.valueAt<StringView>(pos).size();
+      sink.putVarint(static_cast<uint64_t>(length) + 1);
+    });
+    forEachLivePos(in, row, [&](uint32_t pos) {
+      if (!isNullPos(pos)) {
+        const auto text = plan.decoded.valueAt<StringView>(pos);
+        sink.putRaw(text.data(), text.size());
+      }
+    });
+  }
+}
+
+template <typename Sink>
+void encodeTimestampBatch(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  // Timestamps go out as nullable int64 microseconds (null passes value 0).
+  const auto rowCount = static_cast<vector_size_t>(sinks.size());
+  for (vector_size_t row = 0; row < rowCount; ++row) {
+    if (rowNulls == nullptr || !bits::isBitNull(rowNulls, row)) {
+      Sink& sink = sinks[row];
+      forEachLivePos(in, row, [&](uint32_t pos) {
+        const bool absent = plan.mayHaveNulls && plan.decoded.isNullAt(pos);
+        const auto micros =
+            absent ? 0 : plan.decoded.valueAt<Timestamp>(pos).toMicros();
+        sink.putNullableInt64(micros, absent);
+      });
+    }
+  }
+}
+
+// HUGEINT wire format: the null marker is folded into the low int64 slot (no
+// separate presence tag). null -> nullableInt64(_, null) (a single 0x00 byte);
+// non-null -> nullableInt64(low 64 of zigzag128(value)), varint(high 64).
+// Small/medium DECIMAL unscaled values encode in a few bytes (vs the old fixed
+// 16). Mirrors detail::writeNullableInt128.
+template <typename Sink>
+void encodeHugeintBatch(
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  const auto rowCount = static_cast<vector_size_t>(sinks.size());
+  for (vector_size_t row = 0; row < rowCount; ++row) {
+    if (rowNulls != nullptr && bits::isBitNull(rowNulls, row)) {
+      continue;
+    }
+    Sink& sink = sinks[row];
+    forEachLivePos(in, row, [&](uint32_t pos) {
+      if (plan.mayHaveNulls && plan.decoded.isNullAt(pos)) {
+        sink.putNullableInt64(0, /*isNull=*/true);
+        return;
+      }
+      const auto zigzag =
+          detail::zigZagEncode128(plan.decoded.valueAt<int128_t>(pos));
+      const auto lowHalf = static_cast<uint64_t>(zigzag);
+      const auto highHalf = static_cast<uint64_t>(zigzag >> 64);
+      sink.putNullableInt64(static_cast<int64_t>(lowHalf), /*isNull=*/false);
+      sink.putVarint(highHalf);
+    });
+  }
+}
+
+template <typename Sink>
+void encodeNullColumnBatch(
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  // Every live position of an all-null column is written as varint(0).
+  const auto rowCount = static_cast<vector_size_t>(sinks.size());
+  for (vector_size_t row = 0; row < rowCount; ++row) {
+    if (rowNulls == nullptr || !bits::isBitNull(rowNulls, row)) {
+      Sink& sink = sinks[row];
+      forEachLivePos(in, row, [&sink](uint32_t) { sink.putVarint(0); });
+    }
+  }
+}
+
+// ARRAY/MAP cardinality emission + (conditionally) child slot tree build.
+//
+// For SizeSink: emits per-slot cardinality varint *sizes* and builds the
+//   child SlotRange/boundaries arrays in `node`.
+// For WriteSink: emits per-slot cardinality *bytes* and skips the build —
+//   `node` was already populated by the prior SizeSink walk.
+//
+// Walks parent positions identically in both passes (the cardinality stream
+// must be byte-for-byte identical between size and write), but the
+// push_back work happens only on the size pass. This roughly halves the
+// per-call slot-tree overhead vs rebuilding on each pass.
+template <typename Sink>
+void encodeArrayLikeCardinalities(
+    const ColumnPlan& plan,
+    const vector_size_t* rawOffsets,
+    const vector_size_t* rawSizes,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls,
+    SlotTreeNode& node) {
+  const auto N = static_cast<vector_size_t>(sinks.size());
+  const bool mayNulls = plan.mayHaveNulls;
+  const bool identity = plan.decoded.isIdentityMapping();
+  constexpr bool kBuild = std::is_same_v<Sink, SizeSink>; // size pass only
+
+  if constexpr (kBuild) {
+    node.slots.clear();
+    node.boundaries.clear();
+    node.boundaries.resize(N + 1); // boundaries[r]..[r + 1] = row r's slots
+    node.boundaries[0] = 0;
+  }
+
+  // Fast path: no nulls on parent vector, no parentNulls bitmap, identity
+  // mapping. Inline the hot loop (cardinality emit + maybe push).
+  if (!mayNulls && identity && in.parentNulls == nullptr) {
+    for (vector_size_t r = 0; r < N; ++r) {
+      if (rowNulls && bits::isBitNull(rowNulls, r)) {
+        if constexpr (kBuild) {
+          // Skipped row: it contributes an empty child range.
+          node.boundaries[r + 1] = static_cast<uint32_t>(node.slots.size());
+        }
+        continue;
+      }
+      Sink& s = sinks[r];
+      const auto lo = in.rowBoundaries[r];
+      const auto hi = in.rowBoundaries[r + 1];
+      for (uint32_t i = lo; i < hi; ++i) {
+        const auto& sr = in.slots[i];
+        const uint32_t end = sr.base + sr.count;
+        for (uint32_t p = sr.base; p < end; ++p) {
+          const auto sz = static_cast<uint32_t>(rawSizes[p]);
+          s.putVarint(static_cast<uint64_t>(sz) + 1); // 0 is the null marker
+          if constexpr (kBuild) {
+            node.slots.push_back({static_cast<uint32_t>(rawOffsets[p]), sz});
+          }
+        }
+      }
+      if constexpr (kBuild) {
+        node.boundaries[r + 1] = static_cast<uint32_t>(node.slots.size());
+      }
+    }
+    return;
+  }
+
+  for (vector_size_t r = 0; r < N; ++r) {
+    if (rowNulls && bits::isBitNull(rowNulls, r)) {
+      if constexpr (kBuild) {
+        node.boundaries[r + 1] = static_cast<uint32_t>(node.slots.size());
+      }
+      continue;
+    }
+    Sink& s = sinks[r];
+    forEachLivePos(in, r, [&](uint32_t p) {
+      if (mayNulls && plan.decoded.isNullAt(p)) {
+        s.putVarint(0); // null container: marker only, no child slot pushed
+        return;
+      }
+      const auto idx = // non-identity input goes through decoded.index()
+          identity ? p : plan.decoded.index(static_cast<vector_size_t>(p));
+      const auto sz = static_cast<uint32_t>(rawSizes[idx]);
+      s.putVarint(static_cast<uint64_t>(sz) + 1);
+      if constexpr (kBuild) {
+        node.slots.push_back({static_cast<uint32_t>(rawOffsets[idx]), sz});
+      }
+    });
+    if constexpr (kBuild) {
+      node.boundaries[r + 1] = static_cast<uint32_t>(node.slots.size());
+    }
+  }
+}
+
+template <typename Sink>
+void encodeArrayBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  // Child slots live in the plan tree: the SizeSink pass fills them and the
+  // WriteSink pass reads them back as-is (no cursor/scratch).
+  SlotTreeNode& tree = plan.childSlots;
+  const auto* arrays = plan.decoded.base()->as<ArrayVector>();
+  if constexpr (std::is_same_v<Sink, SizeSink>) {
+    tree.slots.reserve(arrays->elements()->size()); // capacity hint only
+  }
+  const auto* offsets = arrays->rawOffsets();
+  const auto* sizes = arrays->rawSizes();
+  encodeArrayLikeCardinalities(plan, offsets, sizes, in, sinks, rowNulls, tree);
+  SlotView elements{
+      {tree.slots.data(), tree.slots.size()},
+      {tree.boundaries.data(), tree.boundaries.size()},
+      nullptr};
+  const auto& elementPlan = plan.children[0];
+  encodeColumnBatch(*type.childAt(0), elementPlan, elements, sinks, rowNulls);
+}
+
+template <typename Sink>
+void encodeMapBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  // One child slot set (the SizeSink pass builds it) drives keys and values.
+  SlotTreeNode& tree = plan.childSlots;
+  const auto* maps = plan.decoded.base()->as<MapVector>();
+  if constexpr (std::is_same_v<Sink, SizeSink>) {
+    tree.slots.reserve(maps->mapKeys()->size()); // capacity hint only
+  }
+  const auto* offsets = maps->rawOffsets();
+  const auto* sizes = maps->rawSizes();
+  encodeArrayLikeCardinalities(plan, offsets, sizes, in, sinks, rowNulls, tree);
+  SlotView entries{
+      {tree.slots.data(), tree.slots.size()},
+      {tree.boundaries.data(), tree.boundaries.size()},
+      nullptr};
+  for (uint32_t child = 0; child < 2; ++child) { // child 0 first, then child 1
+    encodeColumnBatch(
+        *type.childAt(child), plan.children[child], entries, sinks, rowNulls);
+  }
+}
+
+template <typename Sink>
+void encodeRowBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls,
+    bool emitMarker) {
+  const auto N = static_cast<vector_size_t>(sinks.size());
+  const bool mayNulls = plan.mayHaveNulls;
+
+  // The subfield slot range equals this ROW's slot range — same positions,
+  // filtered by combined ancestor + this-level nulls. Materialize a bitmap
+  // only if there's anything to filter.
+  const bool needBitmap = mayNulls || in.parentNulls != nullptr;
+  vector_size_t bitmapBits = 0; // 1 + highest position any slot references
+  if (needBitmap) {
+    for (const auto& sr : in.slots) {
+      const auto endPos = static_cast<vector_size_t>(sr.base + sr.count);
+      if (endPos > bitmapBits) {
+        bitmapBits = endPos;
+      }
+    }
+  }
+
+  std::vector<uint64_t> childNullsBuf;
+  const uint64_t* childParentNulls = in.parentNulls;
+  if (needBitmap && bitmapBits > 0) {
+    // Start with every bit set; null positions are marked in the loop below.
+    childNullsBuf.assign(bits::nwords(bitmapBits), ~uint64_t{0});
+    childParentNulls = childNullsBuf.data();
+  }
+
+  // Fast path: no nulls anywhere AND no parentNulls. Every position emits
+  // varint(1) = 1 byte; SizeSink does a bulk count add, WriteSink writes 1
+  // byte per position. emitMarker == false (top-level non-null contract)
+  // skips the marker step; nested ROW levels pass the default true.
+  // NOTE(review): only this branch consults emitMarker; the general path
+  // below always writes markers — confirm callers never reach it with false.
+  if (!mayNulls && in.parentNulls == nullptr) {
+    if (emitMarker) {
+      for (vector_size_t r = 0; r < N; ++r) {
+        if (rowNulls && bits::isBitNull(rowNulls, r)) {
+          continue;
+        }
+        Sink& s = sinks[r];
+        const auto lo = in.rowBoundaries[r];
+        const auto hi = in.rowBoundaries[r + 1];
+        if constexpr (std::is_same_v<Sink, SizeSink>) {
+          size_t total = 0;
+          for (uint32_t i = lo; i < hi; ++i) {
+            total += in.slots[i].count;
+          }
+          s.bytes += total; // one marker byte per position
+        } else {
+          for (uint32_t i = lo; i < hi; ++i) {
+            const auto& sr = in.slots[i];
+            const uint32_t endPos = sr.base + sr.count;
+            for (uint32_t p = sr.base; p < endPos; ++p) {
+              (void)p;
+              s.putVarint(1);
+            }
+          }
+        }
+      }
+    }
+    // needBitmap is false here, so childParentNulls is still in.parentNulls.
+    SlotView childView{in.slots, in.rowBoundaries, childParentNulls};
+    const auto fieldCount = type.size();
+    for (size_t f = 0; f < fieldCount; ++f) {
+      encodeColumnBatch(
+          *type.childAt(f), plan.children[f], childView, sinks, rowNulls);
+    }
+    return;
+  }
+
+  for (vector_size_t r = 0; r < N; ++r) {
+    if (rowNulls && bits::isBitNull(rowNulls, r)) {
+      continue;
+    }
+    Sink& s = sinks[r];
+    const auto lo = in.rowBoundaries[r];
+    const auto hi = in.rowBoundaries[r + 1];
+    for (uint32_t i = lo; i < hi; ++i) {
+      const auto& sr = in.slots[i];
+      const uint32_t endPos = sr.base + sr.count;
+      for (uint32_t p = sr.base; p < endPos; ++p) {
+        const bool parentSaysNull = in.parentNulls &&
+            bits::isBitNull(in.parentNulls, static_cast<int32_t>(p));
+        if (parentSaysNull) {
+          if (!childNullsBuf.empty()) {
+            bits::setNull(childNullsBuf.data(), p, true);
+          }
+          continue; // ancestor is null: no marker is written for p
+        }
+        if (mayNulls && plan.decoded.isNullAt(p)) {
+          s.putVarint(0); // null ROW at p
+          if (!childNullsBuf.empty()) {
+            bits::setNull(childNullsBuf.data(), p, true);
+          }
+        } else {
+          s.putVarint(1); // non-null ROW at p
+        }
+      }
+    }
+  }
+
+  // Fields reuse this level's slots; the combined bitmap is their parentNulls.
+  SlotView childView{in.slots, in.rowBoundaries, childParentNulls};
+  const auto fieldCount = type.size();
+  for (size_t f = 0; f < fieldCount; ++f) {
+    encodeColumnBatch(
+        *type.childAt(f), plan.children[f], childView, sinks, rowNulls);
+  }
+}
+
+template <typename Sink>
+void encodeColumnBatch(
+    const Type& type,
+    const ColumnPlan& plan,
+    SlotView in,
+    folly::Range<Sink*> sinks,
+    const uint64_t* rowNulls) {
+  if (plan.isNullColumn) { // checked before the kind dispatch below
+    encodeNullColumnBatch(in, sinks, rowNulls);
+    return;
+  }
+  switch (plan.kind) { // dispatches on plan.kind, not on type.kind()
+    case TypeKind::BOOLEAN:
+      encodeBooleanBatch(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::TINYINT:
+      encodeIntegerBatchT<Sink, int8_t>(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::SMALLINT:
+      encodeIntegerBatchT<Sink, int16_t>(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::INTEGER:
+      encodeIntegerBatchT<Sink, int32_t>(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::BIGINT:
+      encodeIntegerBatchT<Sink, int64_t>(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::REAL:
+      encodeRealBatch(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::DOUBLE:
+      encodeDoubleBatch(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::HUGEINT:
+      encodeHugeintBatch(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY: // shares the VARCHAR encoder
+      encodeVarcharBatch(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::TIMESTAMP:
+      encodeTimestampBatch(plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::ARRAY:
+      encodeArrayBatch(type, plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::MAP:
+      encodeMapBatch(type, plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::ROW:
+      // emitMarker is not passed: its default comes from a declaration
+      encodeRowBatch(type, plan, in, sinks, rowNulls);
+      return;
+    case TypeKind::UNKNOWN: // same encoding as an all-null column
+      encodeNullColumnBatch(in, sinks, rowNulls);
+      return;
+    default:
+      BOLT_UNREACHABLE();
+  }
+}
+
+// Explicit instantiations for the two passes (size + write). Keeping the kernel
+// bodies in this TU (not a header) is what isolates their code layout.
+template void encodeColumnBatch<SizeSink>(
+    const Type&,
+    const ColumnPlan&,
+    SlotView,
+    folly::Range<SizeSink*>,
+    const uint64_t*);
+template void encodeColumnBatch<WriteSink>(
+    const Type&,
+    const ColumnPlan&,
+    SlotView,
+    folly::Range<WriteSink*>,
+    const uint64_t*);
+template void encodeRowBatch<SizeSink>( // ROW entry point, takes emitMarker
+    const Type&,
+    const ColumnPlan&,
+    SlotView,
+    folly::Range<SizeSink*>,
+    const uint64_t*,
+    bool);
+template void encodeRowBatch<WriteSink>(
+    const Type&,
+    const ColumnPlan&,
+    SlotView,
+    folly::Range<WriteSink*>,
+    const uint64_t*,
+    bool);
+
+} // namespace bytedance::bolt::row::dense_row
diff --git a/bolt/row/dense/DenseRowScalar.h b/bolt/row/dense/DenseRowScalar.h
new file mode 100644
index 000000000..266a95428
--- /dev/null
+++ b/bolt/row/dense/DenseRowScalar.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+#include <folly/Range.h>
+
+#include "bolt/vector/BaseVector.h"
+#include "bolt/vector/DecodedVector.h"
+
+namespace bytedance::bolt::row::dense_row {
+struct RowCursor;
+}
+
+// Scalar column fast path: a scalar-typed (non ARRAY/MAP/ROW) top-level column
+// has a trivial per-row wire layout `[v]` — no row marker, no cardinality
+// cards, no parent-null filtering — so it is encoded/decoded column-at-a-time,
+// skipping the SlotView machinery entirely. DenseRow routes each scalar
+// top-level field here and each complex field to the general path
+// (DenseRowGeneral.h).
+namespace bytedance::bolt::row::dense_row::scalar {
+
+// Column-at-a-time size accumulation: adds field `dec`'s per-row byte counts
+// into rowSizes[0..N).
+void addColumnSizes(
+    const Type& type,
+    const DecodedVector& dec,
+    vector_size_t N,
+    size_t* rowSizes);
+
+// Column-at-a-time write: appends field `dec`'s bytes through per-row cursors
+// rowCursors[0..N), advancing each.
+void writeColumn(
+    const Type& type,
+    const DecodedVector& dec,
+    vector_size_t N,
+    uint8_t** rowCursors);
+
+// Column-at-a-time read: decodes one scalar value per row from cursors[0..N)
+// into `dst`, advancing each cursor. Inverse of writeColumn.
+void readColumn(
+    const Type& type,
+    BaseVector& dst,
+    vector_size_t N,
+    folly::Range<dense_row::RowCursor*> cursors);
+
+} // namespace bytedance::bolt::row::dense_row::scalar
diff --git a/bolt/row/dense/DenseRowScalarDecode.cpp b/bolt/row/dense/DenseRowScalarDecode.cpp
new file mode 100644
index 000000000..d2b0066e5
--- /dev/null
+++ b/bolt/row/dense/DenseRowScalarDecode.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Scalar-row fast path — DECODE side (read). See DenseRowScalar.h.
+//
+// Column-at-a-time, row-major reads: for each scalar field, walk the N per-row
+// cursors and decode one value from each. This is already the fast shape — it
+// skips the SlotView machinery the general decoder uses — and there is no SIMD
+// batch decode because varint parsing is inherently sequential. The varint
+// readers (readVarint / readNullableInt64) use the BMI2 short-fast-path. The
+// input is always marker-less with no top-level null rows, so there is no
+// per-row row-null filtering here.
+//
+// Split from the encode side so the two layout-sensitive scalar hot paths do
+// not perturb each other's code layout.
+
+#include "bolt/row/dense/DenseRowScalar.h"
+
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "bolt/row/dense/DenseRowGeneral.h"
+#include "bolt/row/dense/IntVarint.h"
+#include "bolt/vector/ComplexVector.h"
+#include "bolt/vector/FlatVector.h"
+
+namespace bytedance::bolt::row::dense_row::scalar {
+
+template <typename T>
+void readIntColumn(
+    FlatVector<T>* flat,
+    vector_size_t N,
+    folly::Range<RowCursor*> cursors) {
+  auto* values = flat->mutableRawValues();
+  for (vector_size_t row = 0; row < N; ++row) {
+    RowCursor& cursor = cursors[row];
+    bool absent{false};
+    int64_t wide{0};
+    [[maybe_unused]] const bool parsed =
+        readNullableInt64(cursor.cur, cursor.end, absent, wide);
+    BOLT_DCHECK(parsed, "DenseRow: malformed integer value at row {}", row);
+    if constexpr (!std::is_same_v<T, int64_t>) {
+      BOLT_DCHECK(
+          absent ||
+              (wide >= static_cast<int64_t>(std::numeric_limits<T>::min()) &&
+               wide <= static_cast<int64_t>(std::numeric_limits<T>::max())),
+          "DenseRow: integer value out of range at row {}: {}",
+          row,
+          wide);
+    }
+    flat->setNull(row, absent);
+    if (!absent) {
+      values[row] = static_cast<T>(wide);
+    }
+  }
+}
+
+void readColumn(
+    const Type& type,
+    BaseVector& dst,
+    vector_size_t N,
+    folly::Range<RowCursor*> cursors) {
+  switch (type.kind()) { // each branch reads one value per row cursor
+    case TypeKind::BOOLEAN: { // varint: 0 = null, 1 = false, 2 = true
+      auto* flat = dst.asUnchecked<FlatVector<bool>>();
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        uint64_t v{0};
+        [[maybe_unused]] const bool ok = readVarint(c.cur, c.end, v);
+        BOLT_DCHECK(ok, "DenseRow: malformed boolean at row {}", r);
+        if (v == 0) {
+          flat->setNull(r, true);
+        } else {
+          BOLT_DCHECK(
+              v <= 2, "DenseRow: invalid boolean encoding at row {}: {}", r, v);
+          flat->setNull(r, false);
+          flat->set(r, v == 2);
+        }
+      }
+      return;
+    }
+    case TypeKind::UNKNOWN: // always null: a varint(0) marker per row
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        uint64_t v{0};
+        [[maybe_unused]] const bool ok = readVarint(c.cur, c.end, v);
+        BOLT_DCHECK(ok, "DenseRow: malformed unknown-type marker at row {}", r);
+        BOLT_DCHECK(
+            v == 0, "DenseRow: unknown-type expected null marker at row {}", r);
+        dst.setNull(r, true);
+      }
+      return;
+    case TypeKind::TINYINT:
+      readIntColumn<int8_t>(dst.asUnchecked<FlatVector<int8_t>>(), N, cursors);
+      return;
+    case TypeKind::SMALLINT:
+      readIntColumn<int16_t>(
+          dst.asUnchecked<FlatVector<int16_t>>(), N, cursors);
+      return;
+    case TypeKind::INTEGER:
+      readIntColumn<int32_t>(
+          dst.asUnchecked<FlatVector<int32_t>>(), N, cursors);
+      return;
+    case TypeKind::BIGINT:
+      readIntColumn<int64_t>(
+          dst.asUnchecked<FlatVector<int64_t>>(), N, cursors);
+      return;
+    case TypeKind::REAL: { // fixed 4-byte word; kNullFloatBits means null
+      auto* flat = dst.asUnchecked<FlatVector<float>>();
+      auto* raw = flat->mutableRawValues();
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        BOLT_USER_CHECK(
+            static_cast<size_t>(c.end - c.cur) >= sizeof(uint32_t),
+            "DenseRow: truncated real at row {}",
+            r); // always-on: a truncated row must not over-read the input
+        uint32_t b;
+        std::memcpy(&b, c.cur, sizeof(b));
+        c.cur += sizeof(b);
+        if (b == kNullFloatBits) {
+          flat->setNull(r, true);
+        } else {
+          flat->setNull(r, false);
+          if (FOLLY_UNLIKELY(b == (kNullFloatBits ^ 1u))) {
+            b ^= 1u; // maps the flipped pattern back onto kNullFloatBits
+          }
+          std::memcpy(raw + r, &b, sizeof(b));
+        }
+      }
+      return;
+    }
+    case TypeKind::DOUBLE: { // fixed 8-byte word; kNullDoubleBits means null
+      auto* flat = dst.asUnchecked<FlatVector<double>>();
+      auto* raw = flat->mutableRawValues();
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        BOLT_USER_CHECK(
+            static_cast<size_t>(c.end - c.cur) >= sizeof(uint64_t),
+            "DenseRow: truncated double at row {}",
+            r); // always-on: a truncated row must not over-read the input
+        uint64_t b;
+        std::memcpy(&b, c.cur, sizeof(b));
+        c.cur += sizeof(b);
+        if (b == kNullDoubleBits) {
+          flat->setNull(r, true);
+        } else {
+          flat->setNull(r, false);
+          if (FOLLY_UNLIKELY(b == (kNullDoubleBits ^ 1ull))) {
+            b ^= 1ull; // maps the flipped pattern back onto kNullDoubleBits
+          }
+          std::memcpy(raw + r, &b, sizeof(b));
+        }
+      }
+      return;
+    }
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY: {
+      auto* flat = dst.asUnchecked<FlatVector<StringView>>();
+      auto* rawValues = flat->mutableRawValues();
+      // Inline values (<= kInlineSize) live in the StringView itself; longer
+      // ones are copied into the vector's string buffer, carved from fixed-size
+      // chunks (a string larger than a chunk gets its own exact allocation).
+      // Fixed chunks avoid both a look-ahead scan of the remaining cursor bytes
+      // and a getRawStringBufferWithSpace call per non-inline value.
+      constexpr size_t kStringChunk = 32 * 1024;
+      char* heap = nullptr;
+      size_t heapRemaining = 0;
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        uint64_t lenPlus{0}; // wire value: 0 = null, otherwise length + 1
+        // Always-on: the length and payload-bounds guards gate a memcpy of a
+        // wire-controlled length, so a corrupt/truncated value must fail loudly
+        // rather than over-read the input buffer (matches the general decoder).
+        BOLT_USER_CHECK(
+            readVarint(c.cur, c.end, lenPlus),
+            "DenseRow: malformed varchar length at row {}",
+            r);
+        if (lenPlus == 0) {
+          flat->setNull(r, true);
+          continue;
+        }
+        const auto len = static_cast<size_t>(lenPlus - 1);
+        BOLT_USER_CHECK(
+            static_cast<size_t>(c.end - c.cur) >= len,
+            "DenseRow: truncated varchar payload at row {}",
+            r);
+        flat->setNull(r, false);
+        if (len <= StringView::kInlineSize) {
+          rawValues[r] = StringView(reinterpret_cast<const char*>(c.cur), len);
+        } else {
+          if (heapRemaining < len) { // current chunk exhausted: carve a new one
+            const size_t alloc = len > kStringChunk ? len : kStringChunk;
+            heap = flat->getRawStringBufferWithSpace(alloc, /*exactSize=*/true);
+            heapRemaining = alloc;
+          }
+          std::memcpy(heap, c.cur, len);
+          rawValues[r] = StringView(heap, len);
+          heap += len;
+          heapRemaining -= len;
+        }
+        c.cur += len;
+      }
+      return;
+    }
+    case TypeKind::TIMESTAMP: { // nullable int64 microseconds
+      auto* flat = dst.asUnchecked<FlatVector<Timestamp>>();
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        bool isNull{false};
+        int64_t micros{0};
+        [[maybe_unused]] const bool ok =
+            readNullableInt64(c.cur, c.end, isNull, micros);
+        BOLT_DCHECK(ok, "DenseRow: malformed timestamp at row {}", r);
+        if (isNull) {
+          flat->setNull(r, true);
+        } else {
+          flat->setNull(r, false);
+          flat->set(r, Timestamp::fromMicrosNoError(micros));
+        }
+      }
+      return;
+    }
+    case TypeKind::HUGEINT: {
+      auto* flat = dst.asUnchecked<FlatVector<int128_t>>();
+      auto* raw = flat->mutableRawValues();
+      for (vector_size_t r = 0; r < N; ++r) {
+        RowCursor& c = cursors[r];
+        bool isNull{false};
+        int128_t v{0};
+        [[maybe_unused]] const bool ok =
+            readNullableInt128(c.cur, c.end, isNull, v);
+        BOLT_DCHECK(ok, "DenseRow: malformed hugeint at row {}", r);
+        if (isNull) {
+          flat->setNull(r, true);
+        } else {
+          flat->setNull(r, false);
+          raw[r] = v;
+        }
+      }
+      return;
+    }
+    default:
+      BOLT_UNREACHABLE();
+  }
+}
+
+} // namespace bytedance::bolt::row::dense_row::scalar
diff --git a/bolt/row/dense/DenseRowScalarEncode.cpp b/bolt/row/dense/DenseRowScalarEncode.cpp
new file mode 100644
index 000000000..799f2f2ae
--- /dev/null
+++ b/bolt/row/dense/DenseRowScalarEncode.cpp
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Scalar-row fast path — ENCODE side (size + write). See
+// DenseRowScalar.h. Split from the decode side so the two layout-sensitive
+// scalar hot paths do not perturb each other's code layout.
+
+#include "bolt/row/dense/DenseRowScalar.h"
+
+#include <cstring>
+
+#include "bolt/row/dense/DenseRowGeneral.h"
+#include "bolt/row/dense/IntVarint.h"
+
+namespace bytedance::bolt::row::dense_row::scalar {
+
+// Adds the serialized size of each of the first N values of an integer column
+// to the matching rowSizes[r]. T is the element type read out of `dec`.
+//
+// Three shapes of `dec` are handled, cheapest first:
+//   - identity mapping: the raw array is handed to the batch kernels in
+//     detail:: (with or without a nulls bitmap);
+//   - constant mapping: one size is computed and added to every row;
+//   - anything else: a per-row loop that goes through dec.index().
+template <typename T>
+FOLLY_ALWAYS_INLINE void
+addIntColumnSizes(const DecodedVector& dec, vector_size_t N, size_t* rowSizes) {
+  const auto* raw = dec.data<T>();
+  if (dec.isIdentityMapping()) {
+    // Flat column. A null rawNulls() means no nulls (mayHaveNulls() is only a
+    // conservative upper bound), so this also covers the "may-null flag set
+    // without a backing bitmap" case. All int widths handled: int64 directly,
+    // int8/int16/int32 via int32.
+    const uint64_t* nulls =
+        dec.mayHaveNulls() ? dec.base()->rawNulls() : nullptr;
+    if (nulls) {
+      // SIMD value sizes + branchless null override via the row-indexed
+      // validity bitmap.
+      detail::addNullableIntColumnSizes(
+          raw, nulls, rowSizes, static_cast<size_t>(N));
+    } else {
+      // Contiguous, no value-nulls: portable xsimd size kernel scattered
+      // per row.
+      detail::addNoNullIntColumnSizes<T>(raw, rowSizes, static_cast<size_t>(N));
+    }
+    return;
+  }
+  if (dec.isConstantMapping()) {
+    // Every row maps to the same base value (or all rows are null), so the
+    // serialized size is identical — compute it once and splat across rows.
+    const bool isNull = dec.mayHaveNulls() && dec.isNullAt(0);
+    // The value is widened to int64 because the size helper is int64-based;
+    // a null row passes 0, which the helper ignores when isNull is set.
+    const int64_t v = isNull ? 0 : static_cast<int64_t>(raw[dec.index(0)]);
+    const size_t sz = detail::nullableInt64SerializedSize(v, isNull);
+    for (vector_size_t r = 0; r < N; ++r) {
+      rowSizes[r] += sz;
+    }
+    return;
+  }
+  // General path: dictionary mappings.
+  const bool mayNulls = dec.mayHaveNulls();
+  for (vector_size_t r = 0; r < N; ++r) {
+    // isNullAt() is only consulted when the vector may contain nulls at all.
+    const bool isNull = mayNulls && dec.isNullAt(r);
+    const int64_t v = isNull ? 0 : static_cast<int64_t>(raw[dec.index(r)]);
+    rowSizes[r] += detail::nullableInt64SerializedSize(v, isNull);
+  }
+}
+
+// Adds the compile-time constant KBYTES to each of the first N entries of
+// rowSizes. Used for column types whose encoded width does not depend on the
+// value. Does nothing when N is not positive.
+template <size_t KBYTES>
+FOLLY_ALWAYS_INLINE void addFixedColumnSizes(
+    vector_size_t N,
+    size_t* rowSizes) {
+  size_t* slot = rowSizes;
+  for (vector_size_t remaining = N; remaining > 0; --remaining) {
+    *slot += KBYTES;
+    ++slot;
+  }
+}
+
+// Size pass of the scalar encoder: for one column, adds the number of bytes
+// each of the first N rows will occupy on the wire to rowSizes[r]. The
+// per-type sizes here have to match what writeColumn emits for the same type.
+//
+// type     - column type; only the kinds listed in the switch are supported.
+// dec      - decoded view of the column.
+// N        - number of rows to process.
+// rowSizes - per-row accumulators, at least N entries; added to, not reset.
+void addColumnSizes(
+    const Type& type,
+    const DecodedVector& dec,
+    vector_size_t N,
+    size_t* rowSizes) {
+  switch (type.kind()) {
+    // One byte per row regardless of value or nullness.
+    case TypeKind::BOOLEAN:
+    case TypeKind::UNKNOWN:
+      addFixedColumnSizes<1>(N, rowSizes);
+      return;
+    // Integer kinds share the nullable-int codec; only the element type the
+    // values are read with differs.
+    case TypeKind::TINYINT:
+      addIntColumnSizes<int8_t>(dec, N, rowSizes);
+      return;
+    case TypeKind::SMALLINT:
+      addIntColumnSizes<int16_t>(dec, N, rowSizes);
+      return;
+    case TypeKind::INTEGER:
+      addIntColumnSizes<int32_t>(dec, N, rowSizes);
+      return;
+    case TypeKind::BIGINT:
+      addIntColumnSizes<int64_t>(dec, N, rowSizes);
+      return;
+    // Floating-point values are fixed width; nulls take the same width.
+    case TypeKind::REAL:
+      addFixedColumnSizes<sizeof(float)>(N, rowSizes);
+      return;
+    case TypeKind::DOUBLE:
+      addFixedColumnSizes<sizeof(double)>(N, rowSizes);
+      return;
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY: {
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        if (mayNulls && dec.isNullAt(r)) {
+          // A null string is a single byte.
+          ++rowSizes[r];
+        } else {
+          // Non-null: varint of (length + 1) followed by the payload bytes.
+          const auto len = dec.valueAt<StringView>(r).size();
+          rowSizes[r] +=
+              detail::varintSize(static_cast<uint64_t>(len) + 1) + len;
+        }
+      }
+      return;
+    }
+    case TypeKind::TIMESTAMP: {
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        if (mayNulls && dec.isNullAt(r)) {
+          // A null timestamp is a single byte.
+          ++rowSizes[r];
+        } else {
+          // Non-null timestamps are sized as a nullable int64 of toMicros().
+          rowSizes[r] += detail::nullableInt64SerializedSize(
+              dec.valueAt<Timestamp>(r).toMicros(), false);
+        }
+      }
+      return;
+    }
+    case TypeKind::HUGEINT: {
+      // null folded into the low slot: nullableInt64(zigzag128 low 64), then
+      // varint(high 64) if non-null. See detail::writeNullableInt128.
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        const bool isNull = mayNulls && dec.isNullAt(r);
+        rowSizes[r] += detail::nullableInt128SerializedSize(
+            isNull ? int128_t{0} : dec.valueAt<int128_t>(r), isNull);
+      }
+      return;
+    }
+    // Any other kind is not expected to reach the scalar fast path.
+    default:
+      BOLT_UNREACHABLE();
+  }
+}
+
+// Writes one integer column into the per-row output cursors. For each of the
+// first N rows the value (or a null marker) is appended at rowCursors[r] with
+// the nullable-int64 codec, and the cursor is advanced past the bytes written.
+// T is the element type the values are read with; each value is widened to
+// int64 before encoding.
+template <typename T>
+FOLLY_ALWAYS_INLINE void writeIntColumn(
+    const DecodedVector& dec,
+    vector_size_t N,
+    uint8_t** rowCursors) {
+  const auto* values = dec.data<T>();
+  const bool checkNulls = dec.mayHaveNulls();
+  const bool flat = dec.isIdentityMapping();
+  for (vector_size_t row = 0; row < N; ++row) {
+    uint8_t*& cursor = rowCursors[row];
+    if (checkNulls && dec.isNullAt(row)) {
+      // Null rows never touch the value array.
+      cursor = writeNullableInt64(0, true, cursor);
+      continue;
+    }
+    // A flat column is indexed by row directly; otherwise go through the
+    // decoded index mapping.
+    const T element = flat ? values[row] : values[dec.index(row)];
+    cursor = writeNullableInt64(static_cast<int64_t>(element), false, cursor);
+  }
+}
+
+// Write pass of the scalar encoder: for one column, appends the encoding of
+// each of the first N rows at rowCursors[r] and advances that cursor past the
+// bytes written. No bounds are checked here; the byte counts written per type
+// are the ones addColumnSizes accounts for.
+//
+// type       - column type; only the kinds listed in the switch are supported.
+// dec        - decoded view of the column.
+// N          - number of rows to process.
+// rowCursors - one write position per row, at least N entries.
+void writeColumn(
+    const Type& type,
+    const DecodedVector& dec,
+    vector_size_t N,
+    uint8_t** rowCursors) {
+  switch (type.kind()) {
+    case TypeKind::BOOLEAN: {
+      // One byte per row: 0 = null, 1 = false, 2 = true.
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        if (mayNulls && dec.isNullAt(r)) {
+          *rowCursors[r]++ = uint8_t{0};
+        } else {
+          *rowCursors[r]++ = dec.valueAt<bool>(r) ? uint8_t{2} : uint8_t{1};
+        }
+      }
+      return;
+    }
+    case TypeKind::UNKNOWN:
+      // Every row gets the single 0 byte; `dec` is not consulted.
+      for (vector_size_t r = 0; r < N; ++r) {
+        *rowCursors[r]++ = uint8_t{0};
+      }
+      return;
+    // Integer kinds share writeIntColumn; only the element type differs.
+    case TypeKind::TINYINT:
+      writeIntColumn<int8_t>(dec, N, rowCursors);
+      return;
+    case TypeKind::SMALLINT:
+      writeIntColumn<int16_t>(dec, N, rowCursors);
+      return;
+    case TypeKind::INTEGER:
+      writeIntColumn<int32_t>(dec, N, rowCursors);
+      return;
+    case TypeKind::BIGINT:
+      writeIntColumn<int64_t>(dec, N, rowCursors);
+      return;
+    case TypeKind::REAL: {
+      // Fixed 4 bytes per row: the raw bit pattern of the float, with
+      // kNullFloatBits standing in for null.
+      const bool mayNulls = dec.mayHaveNulls();
+      const bool identity = dec.isIdentityMapping();
+      const auto* raw = dec.data<float>();
+      for (vector_size_t r = 0; r < N; ++r) {
+        uint32_t b;
+        if (mayNulls && dec.isNullAt(r)) {
+          b = kNullFloatBits;
+        } else {
+          const float value = identity ? raw[r] : raw[dec.index(r)];
+          // memcpy reads the bit pattern without a type-punning cast.
+          std::memcpy(&b, &value, sizeof(b));
+          // kNullFloatBits is the canonical quiet NaN. Flipping the low
+          // mantissa bit yields another NaN.
+          // This keeps a real value from colliding with the null pattern.
+          if (FOLLY_UNLIKELY(b == kNullFloatBits)) {
+            b ^= 1u;
+          }
+        }
+        std::memcpy(rowCursors[r], &b, sizeof(b));
+        rowCursors[r] += sizeof(b);
+      }
+      return;
+    }
+    case TypeKind::DOUBLE: {
+      // Fixed 8 bytes per row: same scheme as REAL with kNullDoubleBits.
+      const bool mayNulls = dec.mayHaveNulls();
+      const bool identity = dec.isIdentityMapping();
+      const auto* raw = dec.data<double>();
+      for (vector_size_t r = 0; r < N; ++r) {
+        uint64_t b;
+        if (mayNulls && dec.isNullAt(r)) {
+          b = kNullDoubleBits;
+        } else {
+          const double value = identity ? raw[r] : raw[dec.index(r)];
+          std::memcpy(&b, &value, sizeof(b));
+          // kNullDoubleBits is the canonical quiet NaN. Flipping the low
+          // mantissa bit yields another NaN.
+          // This keeps a real value from colliding with the null pattern.
+          if (FOLLY_UNLIKELY(b == kNullDoubleBits)) {
+            b ^= 1ull;
+          }
+        }
+        std::memcpy(rowCursors[r], &b, sizeof(b));
+        rowCursors[r] += sizeof(b);
+      }
+      return;
+    }
+    case TypeKind::VARCHAR:
+    case TypeKind::VARBINARY: {
+      // Length-prefixed bytes: varint(0) marks null, otherwise
+      // varint(size + 1) followed by the payload, so an empty string
+      // (prefix 1) stays distinct from null (prefix 0).
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        uint8_t* out = rowCursors[r];
+        if (mayNulls && dec.isNullAt(r)) {
+          out = writeVarint(0, out);
+        } else {
+          const auto sv = dec.valueAt<StringView>(r);
+          out = writeVarint(static_cast<uint64_t>(sv.size()) + 1, out);
+          std::memcpy(out, sv.data(), sv.size());
+          out += sv.size();
+        }
+        rowCursors[r] = out;
+      }
+      return;
+    }
+    case TypeKind::TIMESTAMP: {
+      // Timestamps are written as a nullable int64 of toMicros().
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        const bool isNull = mayNulls && dec.isNullAt(r);
+        const int64_t v = isNull ? 0 : dec.valueAt<Timestamp>(r).toMicros();
+        rowCursors[r] = writeNullableInt64(v, isNull, rowCursors[r]);
+      }
+      return;
+    }
+    case TypeKind::HUGEINT: {
+      // 128-bit values go through the nullable-int128 codec; a null row
+      // passes 0 as a placeholder value.
+      const bool mayNulls = dec.mayHaveNulls();
+      for (vector_size_t r = 0; r < N; ++r) {
+        const bool isNull = mayNulls && dec.isNullAt(r);
+        rowCursors[r] = writeNullableInt128(
+            isNull ? int128_t{0} : dec.valueAt<int128_t>(r),
+            isNull,
+            rowCursors[r]);
+      }
+      return;
+    }
+    // Any other kind is not expected to reach the scalar fast path.
+    default:
+      BOLT_UNREACHABLE();
+  }
+}
+
+} // namespace bytedance::bolt::row::dense_row::scalar
diff --git a/bolt/row/dense/IntVarint.h b/bolt/row/dense/IntVarint.h
new file mode 100644
index 000000000..fba442f47
--- /dev/null
+++ b/bolt/row/dense/IntVarint.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include <folly/CPortability.h>
+#include <folly/Likely.h>
+#include <xsimd/xsimd.hpp>
+
+#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64)
+#include <immintrin.h>
+#endif
+
+#include "bolt/common/base/BitUtil.h"
+#include "bolt/type/HugeInt.h"
+
+// Integer varint codec for the dense row format. Organized bottom-up in five
+// layers; each layer only calls the ones below it:
+//
+//   L1  raw varint        — LEB128 read/write (scalar / BMI2 / dispatchers)
+//   L2  zigzag            — 64/128-bit sign-folding primitives
+//   L3  nullable codec    — single-value wire mapping: null = 0x00, INT64_MIN
+//                           sentinel, else varint(zigzag(adjust(v)))
+//   L4  SIMD size kernels — per-batch encoded-size computation (xsimd lanes)
+//   L5  column-level      — whole-column size sum / per-row scatter loops
+//
+// Naming: a `*Batch` suffix marks an L4 pure kernel over one SIMD batch;
+// un-suffixed L5 functions loop over a whole array. The scalar L3 size math and
+// the L4 SIMD kernels intentionally duplicate the same formula — SIMD main
+// loops need a scalar tail, and encode correctness relies on the two agreeing.
+namespace bytedance::bolt::row::detail {
+
+// =============================================================================
+// L1 — Raw varint (LEB128): scalar + BMI2 implementations and dispatchers.
+// =============================================================================
+
+// A varint byte carries 7 payload bits; the high bit (0x80) is the
+// continuation flag. The final byte of a varint is the one with it clear.
+// kVarintPayloadBits is the mask selecting those 7 payload bits of one byte.
+// It is a uint64_t so that `byte & kVarintPayloadBits` is already 64 bits wide
+// before the readers shift it into position.
+constexpr uint64_t kVarintPayloadBits{0x7f};
+// 8-byte-wide versions of the payload / continuation bit patterns, for the
+// BMI2 pdep/pext paths that process 8 wire bytes per step.
+// kVarintPayloadMask64 has the low 7 bits of every byte set;
+// kVarintContinuationMask64 has the high bit of every byte set.
+constexpr uint64_t kVarintPayloadMask64{0x7f7f7f7f7f7f7f7fULL};
+constexpr uint64_t kVarintContinuationMask64{0x8080808080808080ULL};
+
+// True when `b` terminates a varint, i.e. its continuation flag (0x80) is
+// clear.
+FOLLY_ALWAYS_INLINE bool varintIsLastByte(uint8_t b) {
+  return !(b & 0x80);
+}
+
+// Number of bytes the varint encoding of `value` occupies (1..10): one byte
+// per started group of 7 significant bits. Zero is forced to one significant
+// bit so that it still takes one byte (and so clz is never called on 0).
+FOLLY_ALWAYS_INLINE size_t varintSize(uint64_t value) {
+  const int leadingZeros = __builtin_clzll(value | 1ULL);
+  const int significantBits = 64 - leadingZeros;
+  return static_cast<size_t>((significantBits + 6) / 7);
+}
+
+// Portable byte-at-a-time varint writer. Emits the low 7 bits first, setting
+// the continuation flag on every byte except the last. Returns the position
+// just past the last byte written; the caller guarantees the space.
+FOLLY_ALWAYS_INLINE uint8_t* writeVarintScalar(uint64_t value, uint8_t* out) {
+  for (; value > 0x7f; value >>= 7) {
+    *out = static_cast<uint8_t>(value) | 0x80;
+    ++out;
+  }
+  *out = static_cast<uint8_t>(value);
+  return out + 1;
+}
+
+// Portable byte-at-a-time varint reader without bounds checks. Consumes bytes
+// until one has the continuation flag clear, giving up after 10 bytes (the
+// shift reaches 64). On success stores the result in `value` and returns true;
+// on failure returns false and leaves `value` untouched. `in` is advanced past
+// every byte that was read in both cases. It can read past the buffer only on
+// malformed input; readVarint's single in <= end check validates the consumed
+// length afterwards.
+FOLLY_ALWAYS_INLINE bool readVarintScalar(const uint8_t*& in, uint64_t& value) {
+  uint64_t accumulated{0};
+  for (uint32_t shift = 0; shift < 64; shift += 7) {
+    const uint8_t current = *in++;
+    accumulated |= (current & kVarintPayloadBits) << shift;
+    if (varintIsLastByte(current)) {
+      value = accumulated;
+      return true;
+    }
+  }
+  return false;
+}
+
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
+
+// Builds the table indexed by varint length (1..8 bytes): entry `len` has the
+// continuation flag (0x80) set in each of the first len - 1 bytes and clear
+// everywhere else. Entries 0 and 1 are zero.
+constexpr std::array<uint64_t, 9> makeVarintContinuationMasks() {
+  std::array<uint64_t, 9> table{};
+  uint64_t running{0};
+  // Each length adds the flag of one more leading byte to the previous mask.
+  for (size_t len = 2; len < table.size(); ++len) {
+    running |= 0x80ULL << ((len - 2) * 8);
+    table[len] = running;
+  }
+  return table;
+}
+
+// Continuation-flag masks by varint length, computed at compile time.
+inline constexpr std::array<uint64_t, 9> kVarintContinuationMasks =
+    makeVarintContinuationMasks();
+
+// BMI2 varint writer: spreads the value's 7-bit groups into the low 7 bits of
+// consecutive bytes with pdep, then ORs in the continuation flags. Returns the
+// position just past the last byte written; the caller guarantees the space.
+inline __attribute__((target("bmi2"))) uint8_t* writeVarintBmi2(
+    uint64_t value,
+    uint8_t* out) {
+  if (value < (1ULL << 56)) {
+    // Up to 56 payload bits fit in one 8-byte pdep result (8 x 7 bits).
+    const auto bits = 64 - __builtin_clzll(value | 1ULL);
+    const auto len = static_cast<size_t>((bits + 6) / 7);
+
+    uint64_t packed = _pdep_u64(value, kVarintPayloadMask64);
+    // _pdep places only the 7 data bits. Set continuation bits (MSB=1) for
+    // the first len - 1 bytes; the last byte keeps MSB=0.
+    packed |= kVarintContinuationMasks[len];
+    // Only the first `len` bytes of the packed word are written out.
+    std::memcpy(out, &packed, len);
+    return out + len;
+  }
+
+  // Values >= 2^56 need 9 or 10 bytes in unsigned varint form (10 bytes from
+  // 2^63 up, e.g. zigzag(INT64_MAX) == 2^64 - 2). Encode the first 8 bytes
+  // with BMI2, all with the continuation flag set, then encode the remaining
+  // <=8 bits with scalar (1-2 bytes).
+  uint64_t packed = _pdep_u64(value, kVarintPayloadMask64);
+  packed |= kVarintContinuationMask64;
+  std::memcpy(out, &packed, 8);
+  out += 8;
+  return writeVarintScalar(value >> 56, out);
+}
+
+// BMI2 varint reader. With at least 8 readable bytes it loads them as one
+// word, locates the terminating byte from the continuation flags and gathers
+// the payload bits with pext; with fewer it defers to readVarintScalar.
+//
+// in    - read position; advanced past the bytes consumed.
+// end   - one past the last readable byte; used only to guard the 8-byte load.
+// value - receives the decoded value.
+//
+// Returns false for a varint that is still not terminated after 10 bytes (the
+// same input readVarintScalar rejects), true otherwise. Like the other inner
+// readers it does not check that the consumed bytes lie before `end`; that is
+// readVarint's job.
+inline __attribute__((target("bmi2"))) bool
+readVarintBmi2(const uint8_t*& in, const uint8_t* end, uint64_t& value) {
+  // `end - in >= 8` is a memory-safety guard, NOT a redundant validity check:
+  // the 8-byte bulk load below would read past the buffer for a valid 5-7 byte
+  // varint in the final bytes (buffers are sized exactly, no tail padding).
+  // With < 8 bytes left, fall to the byte-at-a-time scalar reader. Truncation
+  // is caught by readVarint's single in <= end check.
+  if (end - in >= 8) {
+    uint64_t word;
+    std::memcpy(&word, in, sizeof(word));
+
+    // A set bit marks a byte whose continuation flag is clear; the lowest one
+    // is the terminator.
+    const uint64_t stopMask = (~word) & kVarintContinuationMask64;
+    if (stopMask != 0) {
+      const auto len =
+          static_cast<size_t>((__builtin_ctzll(stopMask) >> 3) + 1);
+      uint64_t decoded = _pext_u64(word, kVarintPayloadMask64);
+      // pext gathered the payload of all 8 bytes; drop the bits that belong to
+      // bytes after the terminator.
+      if (len < 8) {
+        decoded &= ((1ULL << (len * 7)) - 1);
+      }
+      value = decoded;
+      in += len;
+      return true;
+    }
+
+    // 9-10 byte varint: the first 8 bytes all continue, so a well-formed varint
+    // has its terminator within in[8..9] (in-bounds when end - in >= 9/10). No
+    // per-byte truncation check; readVarint's in <= end catches a short input.
+    uint64_t decoded = _pext_u64(word, kVarintPayloadMask64);
+    auto* cursor = in + 8;
+    const auto byte8 = *cursor++;
+    decoded |= (static_cast<uint64_t>(byte8 & 0x7f) << 56);
+    if ((byte8 & 0x80) == 0) {
+      value = decoded;
+      in = cursor;
+      return true;
+    }
+
+    // Tenth byte: only its lowest payload bit still fits in 64 bits.
+    const auto byte9 = *cursor++;
+    decoded |= (static_cast<uint64_t>(byte9 & 0x1) << 63);
+    value = decoded;
+    in = cursor;
+    // A 64-bit varint never exceeds 10 bytes, so the tenth byte has to be the
+    // terminator. If it still carries the continuation flag the input is
+    // malformed; report that instead of accepting it, in line with
+    // readVarintScalar.
+    return (byte9 & 0x80) == 0;
+  }
+
+  return readVarintScalar(in, value);
+}
+
+#endif
+
+// Inlined fast path for varints up to 4 bytes (values 0..2^28-1 =
+// 268_435_455). Covers the dominant cases in null-fused encodings:
+//   - row markers (varint(0/1) → 1 byte)
+//   - VARCHAR lengths up to ~268M
+//   - BIGINT values in [-(2^27-1), 2^27-1] after zigzag+adjust (covers
+//     lt_2pow8, lt_2pow16, and part of lt_2pow32 entries)
+//   - ARRAY/MAP cardinalities 0..268_435_455
+//
+// The BMI2 path costs an 8-byte load + tzcnt + pext (10-15 cycle dep chain).
+// Inlining up to 4 byte checks (each ~2 cycles) keeps the dep chain short
+// and lets the OoO window see much more parallelism across rows.
+//
+// On failure (none of the first 4 bytes terminates the varint, i.e. a 5+ byte
+// varint or malformed input) this returns false with `in` and `value`
+// untouched, and the caller falls back to BMI2/scalar. No bounds checks here:
+// a well-formed varint stops at its terminator within the buffer; reads run
+// past `end` only on malformed input, bounded to 4 bytes, and readVarint's
+// single in <= end check validates the consumed length.
+// Each length below reconstructs its whole value inside its own return, so the
+// fall-through path (5+ byte varint) does no value arithmetic and the 1-byte
+// case skips the payload mask — this is why it stays hand-unrolled rather than
+// looped (a loop that accumulated the value each step measured ~1% slower on
+// decode). Earlier bytes (b0..b{k-1}) are known to carry continuation bits, so
+// only their payload (low 7) bits contribute; the terminating byte is whole.
+FOLLY_ALWAYS_INLINE bool readVarintShortFastPath(
+    const uint8_t*& in,
+    uint64_t& value) {
+  // Short alias; being uint64_t it also widens each masked byte before the
+  // shifts below.
+  constexpr uint64_t kP = kVarintPayloadBits;
+
+  const uint8_t b0 = in[0];
+  if (FOLLY_LIKELY(varintIsLastByte(b0))) { // 1 byte (< 2^7)
+    value = b0;
+    in += 1;
+    return true;
+  }
+
+  const uint8_t b1 = in[1];
+  if (FOLLY_LIKELY(varintIsLastByte(b1))) { // 2 bytes (< 2^14)
+    value = (b0 & kP) | (uint64_t{b1} << 7);
+    in += 2;
+    return true;
+  }
+
+  const uint8_t b2 = in[2];
+  if (FOLLY_LIKELY(varintIsLastByte(b2))) { // 3 bytes (< 2^21)
+    value = (b0 & kP) | ((b1 & kP) << 7) | (uint64_t{b2} << 14);
+    in += 3;
+    return true;
+  }
+
+  const uint8_t b3 = in[3];
+  if (FOLLY_LIKELY(varintIsLastByte(b3))) { // 4 bytes (< 2^28)
+    value =
+        (b0 & kP) | ((b1 & kP) << 7) | ((b2 & kP) << 14) | (uint64_t{b3} << 21);
+    in += 4;
+    return true;
+  }
+  // All four bytes carry the continuation flag: leave it to the slow path.
+  return false;
+}
+
+// Varint write dispatcher. Values below 0x80 are stored inline as one byte.
+// Larger values go to the scalar writer, except that on x86-64 GCC/Clang
+// builds values of 2^35 and above go to the BMI2 writer. Returns the position
+// just past the last byte written.
+FOLLY_ALWAYS_INLINE uint8_t* writeVarint(uint64_t value, uint8_t* out) {
+  if (value >= 0x80) {
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
+    if (value >= (1ULL << 35)) {
+      return writeVarintBmi2(value, out);
+    }
+#endif
+    return writeVarintScalar(value, out);
+  }
+
+  *out = static_cast<uint8_t>(value);
+  return out + 1;
+}
+
+// Bounds-checked varint read. The inner readers parse optimistically, without
+// per-byte end checks, and may run a few bytes past `end` on malformed input.
+// The check is done once here instead: a read counts as successful only if
+// the position after it is still <= end.
+// Returns false when no byte is available, when the inner reader fails, or
+// when the read over-ran the buffer.
+FOLLY_ALWAYS_INLINE bool
+readVarint(const uint8_t*& in, const uint8_t* end, uint64_t& value) {
+  if (FOLLY_UNLIKELY(in >= end)) {
+    return false;
+  }
+  // Short varints (up to 4 bytes) are decoded inline.
+  if (FOLLY_LIKELY(readVarintShortFastPath(in, value))) {
+    return !(in > end);
+  }
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
+  const bool parsed = readVarintBmi2(in, end, value);
+#else
+  const bool parsed = readVarintScalar(in, value);
+#endif
+  return parsed && in <= end;
+}
+
+// =============================================================================
+// L2 — ZigZag sign folding (64- and 128-bit).
+// =============================================================================
+
+// ZigZag-folds a signed 64-bit value into an unsigned one: 0, -1, 1, -2, ...
+// map to 0, 1, 2, 3, ... so small magnitudes of either sign stay small.
+FOLLY_ALWAYS_INLINE constexpr uint64_t zigZagEncode64(int64_t value) {
+  const auto doubled = static_cast<uint64_t>(value) << 1;
+  // Arithmetic shift: all ones for a negative value, zero otherwise.
+  const auto signFill = static_cast<uint64_t>(value >> 63);
+  return doubled ^ signFill;
+}
+
+// Inverse of zigZagEncode64: the low bit selects the sign, the remaining bits
+// carry the magnitude.
+FOLLY_ALWAYS_INLINE constexpr int64_t zigZagDecode64(uint64_t encoded) {
+  const uint64_t magnitude = encoded >> 1;
+  // All ones when the low bit is set, zero otherwise.
+  const uint64_t signFill = uint64_t{0} - (encoded & 1);
+  return static_cast<int64_t>(magnitude ^ signFill);
+}
+
+// 128-bit counterpart of zigZagEncode64.
+FOLLY_ALWAYS_INLINE constexpr uint128_t zigZagEncode128(int128_t value) {
+  const auto doubled = static_cast<uint128_t>(value) << 1;
+  // Arithmetic shift: all ones for a negative value, zero otherwise.
+  const auto signFill = static_cast<uint128_t>(value >> 127);
+  return doubled ^ signFill;
+}
+
+// Inverse of zigZagEncode128: the low bit selects the sign, the remaining
+// bits carry the magnitude.
+FOLLY_ALWAYS_INLINE constexpr int128_t zigZagDecode128(uint128_t encoded) {
+  const uint128_t magnitude = encoded >> 1;
+  // All ones when the low bit is set, zero otherwise.
+  const uint128_t signFill = ~static_cast<uint128_t>(0) * (encoded & 1);
+  return static_cast<int128_t>(magnitude ^ signFill);
+}
+
+// =============================================================================
+// L3 — Nullable-int wire codec (single value).
+// =============================================================================
+
+// Nullable int64 wire mapping:
+// - null -> 0x00.
+// - INT64_MIN -> 0x80 0x00 (reserved sentinel).
+// - all other values -> varint(zigzag(adjust(v))), where adjust(v) keeps
+//   positive values unchanged and shifts non-positive values by -1.
+//
+// The reserved sentinel keeps null as a single-byte marker while preserving a
+// one-to-one mapping for the full int64 domain.
+//
+// Neither marker collides with a regular value: adjust(v) is never 0, so the
+// varint payload is never 0 and a value never starts with 0x00; and
+// writeVarint never follows a continuation byte with 0x00, so it never emits
+// the pair 0x80 0x00. INT64_MIN needs the sentinel because adjust() would
+// have to subtract 1 from it.
+constexpr std::array<uint8_t, 2> kInt64MinSentinel{0x80, 0x00};
+
+// True only for INT64_MIN, the one value that is written as the reserved
+// two-byte sentinel instead of a varint.
+FOLLY_ALWAYS_INLINE constexpr bool needsExtendedInt64Encoding(int64_t value) {
+  constexpr int64_t kLowest = std::numeric_limits<int64_t>::min();
+  return kLowest == value;
+}
+
+// Pre-zigzag shift that keeps 0 out of the encoded range: positive values are
+// unchanged, zero and negative values move down by one. The result is never
+// 0. Must not be called with INT64_MIN (the subtraction would overflow).
+FOLLY_ALWAYS_INLINE constexpr int64_t adjustInt64ForNullableEncoding(
+    int64_t value) {
+  if (value > 0) {
+    return value;
+  }
+  return value - 1;
+}
+
+// Undoes adjustInt64ForNullableEncoding: positive values are unchanged, zero
+// and negative values move up by one.
+FOLLY_ALWAYS_INLINE constexpr int64_t restoreInt64FromNullableEncoding(
+    int64_t value) {
+  if (value > 0) {
+    return value;
+  }
+  return value + 1;
+}
+
+// Number of bytes writeNullableInt64 emits for (value, isNull): 1 for null, 2
+// for the INT64_MIN sentinel, otherwise the length of
+// varint(zigzag(adjust(value))). `value` is ignored when isNull is set.
+FOLLY_ALWAYS_INLINE size_t
+nullableInt64SerializedSize(int64_t value, bool isNull) {
+  if (isNull) {
+    return 1;
+  }
+  if (needsExtendedInt64Encoding(value)) {
+    return 2;
+  }
+
+  // The varint length depends only on |value|: zigzag(adjust(v)) has exactly
+  // one more significant bit than |v|, so the size is ceil((bitlen(|v|)+1)/7)
+  // and the zigzag/adjust step can be skipped. INT64_MIN was handled above, so
+  // the magnitude below is exact; it is computed in unsigned arithmetic to
+  // stay clear of signed overflow.
+  const auto pattern = static_cast<uint64_t>(value);
+  const auto signFill = static_cast<uint64_t>(value >> 63);
+  const uint64_t magnitude = (pattern ^ signFill) - signFill;
+  const int bitLength = 64 - __builtin_clzll(magnitude | 1ULL);
+  // Same as (bitLength + 7) / 7 for the non-negative bitLength here.
+  return static_cast<size_t>(bitLength / 7 + 1);
+}
+
+// Writes one nullable int64 at `out` and returns the position just past it:
+// a single 0x00 for null, the two-byte sentinel for INT64_MIN, otherwise
+// varint(zigzag(adjust(value))). `value` is ignored when isNull is set.
+FOLLY_ALWAYS_INLINE uint8_t*
+writeNullableInt64(int64_t value, bool isNull, uint8_t* out) {
+  if (isNull) {
+    out[0] = 0;
+    return out + 1;
+  }
+
+  if (FOLLY_UNLIKELY(needsExtendedInt64Encoding(value))) {
+    out[0] = kInt64MinSentinel[0];
+    out[1] = kInt64MinSentinel[1];
+    return out + 2;
+  }
+
+  const int64_t shifted = adjustInt64ForNullableEncoding(value);
+  return writeVarint(zigZagEncode64(shifted), out);
+}
+
+// Reads one nullable int64 written by writeNullableInt64.
+//
+// in     - read position; advanced past the bytes consumed.
+// end    - one past the last readable byte.
+// isNull - set to whether the slot held the null marker.
+// value  - set to the decoded value (0 for null).
+//
+// Returns false when no byte is available, when the varint cannot be read, or
+// when the consumed bytes extend past `end`. The second sentinel byte is
+// inspected without a bounds check of its own; the final in <= end test
+// rejects a sentinel that does not fit.
+FOLLY_ALWAYS_INLINE bool readNullableInt64(
+    const uint8_t*& in,
+    const uint8_t* end,
+    bool& isNull,
+    int64_t& value) {
+  if (FOLLY_UNLIKELY(in >= end)) {
+    return false;
+  }
+
+  const uint8_t lead = in[0];
+  if (lead == 0) {
+    // Null marker: a single zero byte.
+    in += 1;
+    isNull = true;
+    value = 0;
+    return in <= end;
+  }
+
+  // in[1] is only looked at when the first byte already matches the sentinel.
+  if (FOLLY_UNLIKELY(
+          lead == kInt64MinSentinel[0] && in[1] == kInt64MinSentinel[1])) {
+    in += 2;
+    isNull = false;
+    value = std::numeric_limits<int64_t>::min();
+    return in <= end;
+  }
+
+  uint64_t wire{0};
+  const bool parsed = readVarint(in, end, wire);
+  if (!parsed) {
+    return false;
+  }
+
+  isNull = false;
+  const int64_t shifted = zigZagDecode64(wire);
+  value = restoreInt64FromNullableEncoding(shifted);
+  return in <= end;
+}
+
+// Nullable int128 wire mapping (two halves of zigzag128(v), no separate tag):
+//   null     -> nullableInt64(_, null)            (a single 0x00 byte)
+//   non-null -> nullableInt64(low 64 of zigzag128(v)), varint(high 64).
+// The null marker is folded into the low int64 slot via the nullable-int64
+// codec's own 0x00 sentinel, so there is no extra present/null tag byte: a
+// non-null value's low half just rides the same slot, and the high half follows
+// only when present. zigzag128 keeps small-magnitude values (either sign)
+// short, and the two halves reuse the 64-bit varint path (no 128-bit varint).
+// The low half is reinterpreted as int64 for the nullable-int64 codec; that
+// round-trips bit-for-bit (it is a bijection over int64 plus null).
+// Number of bytes writeNullableInt128 emits for (value, isNull): the size of
+// a null int64 slot for null; otherwise the nullable-int64 size of the low 64
+// bits of zigzag128(value) plus the varint size of the high 64 bits.
+FOLLY_ALWAYS_INLINE size_t
+nullableInt128SerializedSize(int128_t value, bool isNull) {
+  if (isNull) {
+    return nullableInt64SerializedSize(0, /*isNull=*/true);
+  }
+  const uint128_t folded = zigZagEncode128(value);
+  const auto lowHalf = static_cast<uint64_t>(folded);
+  const auto highHalf = static_cast<uint64_t>(folded >> 64);
+  const size_t lowBytes = nullableInt64SerializedSize(
+      static_cast<int64_t>(lowHalf), /*isNull=*/false);
+  return lowBytes + varintSize(highHalf);
+}
+
+// Writes one nullable int128 at `out` and returns the position just past it.
+// Null is the null marker of the int64 codec. A value is zigzag128-folded and
+// written as two parts: the low 64 bits through the nullable-int64 codec,
+// then the high 64 bits as a plain varint.
+FOLLY_ALWAYS_INLINE uint8_t*
+writeNullableInt128(int128_t value, bool isNull, uint8_t* out) {
+  if (isNull) {
+    return writeNullableInt64(0, /*isNull=*/true, out);
+  }
+  const uint128_t folded = zigZagEncode128(value);
+  const auto lowHalf = static_cast<uint64_t>(folded);
+  const auto highHalf = static_cast<uint64_t>(folded >> 64);
+  uint8_t* afterLow =
+      writeNullableInt64(static_cast<int64_t>(lowHalf), /*isNull=*/false, out);
+  return writeVarint(highHalf, afterLow);
+}
+
+// Reads one nullable int128 written by writeNullableInt128.
+//
+// in     - read position; advanced past the bytes consumed.
+// end    - one past the last readable byte.
+// isNull - set to whether the low slot held the null marker.
+// value  - set to the decoded value (0 for null); left untouched when the
+//          high half cannot be read.
+//
+// Returns false when either half fails to read.
+FOLLY_ALWAYS_INLINE bool readNullableInt128(
+    const uint8_t*& in,
+    const uint8_t* end,
+    bool& isNull,
+    int128_t& value) {
+  int64_t lowSlot{0};
+  if (!readNullableInt64(in, end, isNull, lowSlot)) {
+    return false;
+  }
+  if (isNull) {
+    // A null value has no high half on the wire.
+    value = 0;
+    return true;
+  }
+
+  uint64_t highSlot{0};
+  const bool gotHigh = readVarint(in, end, highSlot);
+  if (gotHigh) {
+    // Reassemble the zigzag128 pattern from its halves before unfolding it.
+    const uint128_t upper = static_cast<uint128_t>(highSlot) << 64;
+    value = zigZagDecode128(upper | static_cast<uint64_t>(lowSlot));
+  }
+  return gotHigh;
+}
+
+// =============================================================================
+// L4 — Portable (xsimd) nullable-int size kernels: encoded byte counts for one
+// SIMD batch of values. Two kernels: int32 and int64; int8/int16 widen to
+// int32. The size math is pure xsimd::batch, so it runs on AVX2 / AVX-512 /
+// SSE / NEON (compatibility). These must agree with the scalar
+// nullableInt64SerializedSize above.
+// =============================================================================
+
+// int32: width-adaptive, native uint32 lanes. zigzag of an int32 fits uint32
+// for every value except INT32_MIN (special-cased to 5). 4 thresholds; xsimd's
+// unsigned-batch comparison handles the unsigned compare portably.
+//
+// Takes one batch of values and returns, per lane, the number of bytes the
+// nullable-int codec uses for that (non-null) value: 1..5.
+// NOTE(review): the arithmetic relies on the batch_bool -> batch conversion
+// (S(...), U(...)) producing 1 in true lanes and 0 in false lanes — confirm
+// against the xsimd version in use.
+FOLLY_ALWAYS_INLINE xsimd::batch<uint32_t> nullableInt32SizesBatch(
+    xsimd::batch<int32_t> v) {
+  using S = xsimd::batch<int32_t>;
+  using U = xsimd::batch<uint32_t>;
+  const S zero(0);
+  const S adj = v - S(v <= zero); // v > 0 ? v : v - 1
+  const U zz = xsimd::bitwise_cast<U>((adj << 1) ^ (adj >> 31)); // zigzag
+  // Start at one byte and add one for every 7-bit boundary the zigzag value
+  // exceeds.
+  U s(1);
+  s += U(zz > U(static_cast<uint32_t>((1 << 7) - 1)));
+  s += U(zz > U(static_cast<uint32_t>((1 << 14) - 1)));
+  s += U(zz > U(static_cast<uint32_t>((1 << 21) - 1)));
+  s += U(zz > U(static_cast<uint32_t>((1 << 28) - 1)));
+  // INT32_MIN lanes did not get a meaningful `adj` (v - 1 wraps), so their
+  // size is set explicitly.
+  return xsimd::select(
+      xsimd::batch_bool_cast<uint32_t>(
+          v == S(std::numeric_limits<int32_t>::min())),
+      U(5),
+      s);
+}
+
+// int64 size kernel, computed straight from |v| (no zigzag/adjust): size(v) =
+// min k with |v| < 2^(7k-1), i.e. threshold |v| against {2^6, 2^13, ... 2^62}.
+// The >=2^63 (10-byte) case is just the top threshold — no separate sign fixup.
+// INT64_MIN's abs overflows back to a negative value, so all thresholds miss
+// (s=1) and the final select sets it to the 2-byte sentinel. Branchless.
+//
+// This abs form measured ~20% faster than an equivalent zigzag-based kernel (9
+// magnitude thresholds replace zigzag's 8 thresholds + a zz<0 select, and they
+// sit on a shorter dependency chain since |v| is cheaper to derive than the
+// zigzag key), so it is the single int64 size kernel used everywhere.
+//
+// Takes one batch of values and returns, per lane, the number of bytes the
+// nullable-int64 codec uses for that (non-null) value: 1..10. It has to give
+// the same answers as the scalar nullableInt64SerializedSize(v, false).
+// NOTE(review): the arithmetic relies on the batch_bool -> batch conversion
+// (B(...)) producing 1 in true lanes and 0 in false lanes — confirm against
+// the xsimd version in use.
+FOLLY_ALWAYS_INLINE xsimd::batch<int64_t> nullableInt64SizesBatch(
+    xsimd::batch<int64_t> v) {
+  using B = xsimd::batch<int64_t>;
+  // `sign` is all ones in negative lanes, zero elsewhere.
+  const B sign = v >> 63;
+  const B m = (v ^ sign) - sign; // abs(v); INT64_MIN stays negative
+  // 9 magnitude thresholds (emulating a clz, which AVX2 can't vectorize). The
+  // serial `s +=` is not the bottleneck — the compiler reassociates these
+  // associative adds into a tree, and an explicit tree measured identically.
+  B s = B(1);
+  s += B(m > B((1LL << 6) - 1));
+  s += B(m > B((1LL << 13) - 1));
+  s += B(m > B((1LL << 20) - 1));
+  s += B(m > B((1LL << 27) - 1));
+  s += B(m > B((1LL << 34) - 1));
+  s += B(m > B((1LL << 41) - 1));
+  s += B(m > B((1LL << 48) - 1));
+  s += B(m > B((1LL << 55) - 1));
+  s += B(m > B((1LL << 62) - 1));
+  // INT64_MIN lanes take the size of the reserved two-byte sentinel.
+  return xsimd::select(v == B(std::numeric_limits<int64_t>::min()), B(2), s);
+}
+
+// =============================================================================
+// L5 — Column-level loops: size sums and per-row scatters over whole arrays,
+// built on the L4 kernels with scalar (L3) tails.
+//
+// Narrow int8/int16 inputs widen to the int32 size kernel via xsimd's
+// converting load `batch<int32_t>::load_unaligned(const T*)`, which reads
+// batch<int32_t>::size narrow values and sign-extends each to an int32 lane.
+// It picks the right widening per ISA (AVX2/AVX-512/SSE/NEON) with no
+// target-specific intrinsics; for int32 input it is a plain load.
+// =============================================================================
+
+// Sum of nullable-int sizes for a contiguous non-null range of any of
+// int8/int16/int32/int64. Narrow types widen to the int32 kernel. int64 gets a
+// testz-style fast path: if every zigzag in a batch fits 32 bits (common
+// small-magnitude BIGINT) only 4 thresholds are needed; xsimd::all keeps it
+// portable.
+//
+// raw   - `count` contiguous values, all treated as non-null.
+// count - number of values.
+// Returns the total number of bytes the values occupy in the nullable-int
+// encoding.
+template <typename T>
+FOLLY_ALWAYS_INLINE size_t sumNullableIntSizes(const T* raw, size_t count) {
+  static_assert(
+      std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
+          std::is_same_v<T, int32_t> || std::is_same_v<T, int64_t>,
+      "sumNullableIntSizes supports int8/int16/int32/int64");
+  if constexpr (std::is_same_v<T, int64_t>) {
+    using B = xsimd::batch<int64_t>;
+    constexpr std::size_t kBatchSize = B::size;
+    const B zero(0);
+    const B one(1);
+    // Per-lane running totals; reduced to a single sum after the loop.
+    B acc(0);
+    std::size_t j = 0;
+    for (; j + kBatchSize <= count; j += kBatchSize) {
+      const B v = B::load_unaligned(raw + j);
+      // adj/zz are the adjust + zigzag steps of the wire mapping, per lane.
+      const B adj = v - B(v <= zero);
+      const B zz = (adj << 1) ^ (adj >> 63);
+      // Fast path: every lane's zigzag has no bits above the low 32, so at
+      // most 5 bytes are needed and 4 thresholds decide the size.
+      if (xsimd::all((zz >> 32) == zero)) {
+        B s = one;
+        s += B(zz > B((1LL << 7) - 1));
+        s += B(zz > B((1LL << 14) - 1));
+        s += B(zz > B((1LL << 21) - 1));
+        s += B(zz > B((1LL << 28) - 1));
+        acc += s;
+      } else {
+        // At least one wide lane: use the general int64 kernel for the batch.
+        acc += nullableInt64SizesBatch(v);
+      }
+    }
+    auto total = static_cast<size_t>(xsimd::reduce_add(acc));
+    // Scalar tail for the values that do not fill a whole batch.
+    for (; j < count; ++j) {
+      total += nullableInt64SerializedSize(raw[j], false);
+    }
+    return total;
+  } else {
+    using U = xsimd::batch<uint32_t>;
+    constexpr std::size_t kBatchSize = xsimd::batch<int32_t>::size;
+    // Per-lane running totals; reduced to a single sum after the loop.
+    U acc(0U);
+    std::size_t j = 0;
+    for (; j + kBatchSize <= count; j += kBatchSize) {
+      // The load converts narrow inputs to int32 lanes (plain load for int32).
+      acc += nullableInt32SizesBatch(
+          xsimd::batch<int32_t>::load_unaligned(raw + j));
+    }
+    auto total = static_cast<size_t>(xsimd::reduce_add(acc));
+    // Scalar tail for the values that do not fill a whole batch.
+    for (; j < count; ++j) {
+      total += nullableInt64SerializedSize(static_cast<int64_t>(raw[j]), false);
+    }
+    return total;
+  }
+}
+
+// Per-row scatter for a column without nulls: adds the serialized size of
+// raw[r] to rowSizes[r] for every r in [0, count).
+template <typename T>
+FOLLY_ALWAYS_INLINE void
+addNoNullIntColumnSizes(const T* raw, size_t* rowSizes, size_t count) {
+  std::size_t j = 0;
+  if constexpr (std::is_same_v<T, int64_t>) {
+    // Sizes are int64 already: add the batch straight into rowSizes (portable).
+    // Branchless (no testz fastpath): a per-batch "all small" branch regresses
+    // mixed/full-range BIGINT via misprediction, which dominates the
+    // small-value saving.
+    using B = xsimd::batch<int64_t>;
+    constexpr std::size_t kWidth = B::size;
+    auto* rs = reinterpret_cast<int64_t*>(rowSizes);
+    for (; j + kWidth <= count; j += kWidth) {
+      B sz = nullableInt64SizesBatch(B::load_unaligned(raw + j));
+      (B::load_unaligned(rs + j) + sz).store_unaligned(rs + j);
+    }
+  } else {
+    constexpr std::size_t kWidth = xsimd::batch<int32_t>::size;
+    alignas(64) uint32_t sz[kWidth];
+    for (; j + kWidth <= count; j += kWidth) {
+      nullableInt32SizesBatch(xsimd::batch<int32_t>::load_unaligned(raw + j))
+          .store_aligned(sz);
+      for (std::size_t k = 0; k < kWidth; ++k) {
+        rowSizes[j + k] += sz[k];
+      }
+    }
+  }
+  // Scalar tail: rows left over after the last full SIMD batch.
+  for (; j < count; ++j) {
+    rowSizes[j] +=
+        nullableInt64SerializedSize(static_cast<int64_t>(raw[j]), false);
+  }
+}
+
+// Overload for a FLAT int column WITH nulls (the common Spark case that
+// otherwise falls to the scalar loop). A null row contributes exactly 1 byte
+// (the 0x00 marker), independent of the (garbage) value stored at its slot.
+// `nulls` is the row-indexed validity bitmap (bit set = non-null), which the
+// caller guarantees non-null. Supports int8/int16/int32/int64; narrow types
+// widen to the int32 size kernel exactly like addNoNullIntColumnSizes.
+template <typename T>
+FOLLY_ALWAYS_INLINE void addNullableIntColumnSizes(
+    const T* raw,
+    const uint64_t* nulls,
+    size_t* rowSizes,
+    size_t count) {
+  std::size_t j = 0;
+  if constexpr (std::is_same_v<T, int64_t>) {
+    using B = xsimd::batch<int64_t>;
+    constexpr std::size_t kBatchSize = B::size;
+    // Validity bitmaps are addressed in 64-bit words.
+    constexpr std::size_t kWordBits = 64;
+    auto* rs = reinterpret_cast<int64_t*>(rowSizes);
+
+    // Per-lane bit selector {1<<0, 1<<1, ...}; over-aligned for load_aligned.
+    alignas(64) int64_t selArr[kBatchSize];
+    for (std::size_t i = 0; i < kBatchSize; ++i) {
+      selArr[i] = static_cast<int64_t>(int64_t{1} << i);
+    }
+    const B laneSel = B::load_aligned(selArr);
+    const B one(1);
+
+    for (; j + kBatchSize <= count; j += kBatchSize) {
+      B sz = nullableInt64SizesBatch(B::load_unaligned(raw + j));
+      // kBatchSize validity bits for rows [j, j+kBatchSize). kBatchSize divides
+      // kWordBits and j is a multiple of kBatchSize, so the bits never straddle
+      // a word.
+      const uint64_t word = nulls[j / kWordBits];
+      const uint64_t validBits = (word >> (j % kWordBits)) &
+          bits::lowMask(static_cast<int32_t>(kBatchSize));
+      // lane i valid iff bit i set: (broadcast(validBits) & laneSel) ==
+      // laneSel.
+      const auto isValid =
+          (B(static_cast<int64_t>(validBits)) & laneSel) == laneSel;
+      sz = xsimd::select(isValid, sz, one); // null -> 1 byte
+      (B::load_unaligned(rs + j) + sz).store_unaligned(rs + j);
+    }
+  } else {
+    // Narrow int8/int16/int32: widen to the int32 size kernel, then override
+    // null lanes to 1 byte. `sz` is over-aligned for store_aligned.
+    constexpr std::size_t kWidth = xsimd::batch<int32_t>::size;
+    alignas(64) uint32_t sz[kWidth];
+    for (; j + kWidth <= count; j += kWidth) {
+      nullableInt32SizesBatch(xsimd::batch<int32_t>::load_unaligned(raw + j))
+          .store_aligned(sz);
+      for (std::size_t k = 0; k < kWidth; ++k) {
+        const bool isNull = !bits::isBitSet(nulls, static_cast<int32_t>(j + k));
+        rowSizes[j + k] += isNull ? 1U : sz[k];
+      }
+    }
+  }
+  for (; j < count; ++j) {
+    const bool isNull = !bits::isBitSet(nulls, static_cast<int32_t>(j));
+    rowSizes[j] += nullableInt64SerializedSize(
+        isNull ? 0 : static_cast<int64_t>(raw[j]), isNull);
+  }
+}
+
+} // namespace bytedance::bolt::row::detail
diff --git a/bolt/row/tests/CMakeLists.txt b/bolt/row/tests/CMakeLists.txt
index 363687332..b2c8e0560 100644
--- a/bolt/row/tests/CMakeLists.txt
+++ b/bolt/row/tests/CMakeLists.txt
@@ -25,13 +25,15 @@
 # This modified file is released under the same license.
 # --------------------------------------------------------------------------
 
-add_executable(bolt_row_test CompactRowTest.cpp UnsafeRowTest.cpp)
+add_executable(bolt_row_test CompactRowTest.cpp UnsafeRowTest.cpp
+                             DenseRowTest.cpp)
 
 add_test(bolt_row_test bolt_row_test)
 
 target_link_libraries(
   bolt_row_test
-  PRIVATE bolt_testutils
+  PRIVATE bolt_row_fast
+          bolt_testutils
           Folly::folly
           GTest::gtest
           GTest::gtest_main
diff --git a/bolt/row/tests/DenseRowTest.cpp b/bolt/row/tests/DenseRowTest.cpp
new file mode 100644
index 000000000..db4f1f931
--- /dev/null
+++ b/bolt/row/tests/DenseRowTest.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) ByteDance Ltd. and/or its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "bolt/row/dense/DenseRow.h"
+#include "bolt/vector/fuzzer/VectorFuzzer.h"
+#include "bolt/vector/tests/utils/VectorTestBase.h"
+
+using namespace bytedance::bolt::test;
+
+namespace bytedance::bolt::row {
+namespace {
+
+class DenseRowTest : public ::testing::Test, public VectorTestBase {
+ protected:
+  static void SetUpTestCase() {
+    memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{});
+  }
+
+  // Serialize a RowVector into one contiguous buffer plus (N + 1) cumulative
+  // offsets — the test-side equivalent of how shuffle lays out a partition
+  // buffer from DenseRow::rowSizes() and then DenseRow::serialize()s into it.
+  struct Bytes {
+    std::vector<uint8_t> buffer;
+    std::vector<size_t> offsets; // size N + 1
+
+    std::string_view toView(size_t index) const {
+      return std::string_view(
+          reinterpret_cast<const char*>(buffer.data()) + offsets[index],
+          offsets[index + 1] - offsets[index]);
+    }
+
+    std::string toHex(size_t index) const {
+      auto view = toView(index);
+      std::string out;
+      out.reserve(view.size() * 2);
+      static constexpr char kHex[] = "0123456789abcdef";
+      for (unsigned char c : view) {
+        out.push_back(kHex[c >> 4]);
+        out.push_back(kHex[c & 0x0f]);
+      }
+      return out;
+    }
+  };
+
+  static Bytes serializeToBytes(const RowVectorPtr& input) {
+    DenseRow rows(input);
+    const auto n = rows.numRows();
+    Bytes out;
+    out.offsets.resize(n + 1);
+    size_t cum = 0;
+    for (vector_size_t r = 0; r < n; ++r) {
+      out.offsets[r] = cum;
+      cum += rows.rowSizes()[r];
+    }
+    out.offsets[n] = cum;
+    EXPECT_EQ(cum, rows.totalSize());
+    out.buffer.resize(std::max<size_t>(cum, 1));
+    rows.serialize(
+        out.buffer.data(), folly::Range<const size_t*>(out.offsets.data(), n));
+    return out;
+  }
+
+  // DenseRow is marker-less (no top-level null rows), so rebuild a null-free
+  // RowVector from the fuzzed input's children, then serialize -> split ->
+  // deserialize and compare.
+  void roundTrip(const RowVectorPtr& fuzzed) {
+    auto input = makeRowVector(fuzzed->children());
+    const auto rowType =
+        std::dynamic_pointer_cast<const RowType>(input->type());
+    ASSERT_NE(rowType, nullptr);
+    const auto n = input->size();
+
+    auto bytes = serializeToBytes(input);
+    std::vector<std::string_view> data(n);
+    for (vector_size_t r = 0; r < n; ++r) {
+      data[r] = bytes.toView(r);
+    }
+    auto out = DenseRow::deserialize(data, rowType, pool());
+    assertEqualVectors(input, out);
+  }
+
+  VectorPtr
+  fuzzVector(const TypePtr& type, vector_size_t size, uint32_t seed = 7) {
+    VectorFuzzer::Options opts;
+    opts.vectorSize = size;
+    opts.nullRatio = 0.2;
+    opts.dictionaryHasNulls = false;
+    opts.stringVariableLength = true;
+    opts.stringLength = 24;
+    opts.containerVariableLength = true;
+    opts.containerLength = 7;
+    opts.timestampPrecision =
+        VectorFuzzer::Options::TimestampPrecision::kMicroSeconds;
+
+    VectorFuzzer fuzzer(opts, pool(), seed);
+    return fuzzer.fuzzFlat(type, size);
+  }
+};
+
+TEST_F(DenseRowTest, dictionaryEncodedInput) {
+  // DenseRow decodes via DecodedVector (buildPlan), so dictionary-wrapped
+  // inputs round-trip.
+  auto base = makeFlatVector<int64_t>({100, 200, 300, 400});
+  auto indices = makeIndicesInReverse(4);
+  auto dict = BaseVector::wrapInDictionary(nullptr, indices, 4, base);
+  roundTrip(makeRowVector({dict}));
+}
+
+TEST_F(DenseRowTest, constantEncodedInput) {
+  // Constant-wrapped scalars decode via DecodedVector with isConstantMapping();
+  // the scalar encoder sizes them once and splats. Cover a non-null constant,
+  // a null constant, and a constant in a wide row alongside a flat column.
+  roundTrip(makeRowVector({makeConstant<int64_t>(987654321, 16)}));
+  roundTrip(makeRowVector({makeNullConstant(TypeKind::BIGINT, 16)}));
+  roundTrip(makeRowVector({
+      makeConstant<int32_t>(-7, 16),
+      makeConstant<int16_t>(123, 16),
+      makeNullConstant(TypeKind::TINYINT, 16),
+      makeFlatVector<int64_t>(16, [](auto r) { return r * 3 - 5; }),
+  }));
+}
+
+TEST_F(DenseRowTest, rowOfScalars) {
+  roundTrip(std::dynamic_pointer_cast<RowVector>(fuzzVector(
+      ROW({BIGINT(), INTEGER(), REAL(), DOUBLE(), VARCHAR()}), 128, 11)));
+}
+
+TEST_F(DenseRowTest, multiScalarWideRow) {
+  // 10-column flat ROW covering every supported scalar leaf encoder.
+  auto type = ROW({
+      BIGINT(),
+      INTEGER(),
+      SMALLINT(),
+      TINYINT(),
+      BOOLEAN(),
+      REAL(),
+      DOUBLE(),
+      VARCHAR(),
+      TIMESTAMP(),
+      BIGINT(),
+  });
+  roundTrip(std::dynamic_pointer_cast<RowVector>(fuzzVector(type, 256, 17)));
+}
+
+TEST_F(DenseRowTest, bigintEdges) {
+  auto bigint = makeFlatVector<int64_t>({
+      std::numeric_limits<int64_t>::min(),
+      std::numeric_limits<int64_t>::max(),
+      -1,
+      0,
+      1,
+  });
+  roundTrip(makeRowVector({bigint}));
+}
+
+// HUGEINT (128-bit, used by DECIMAL(precision > 18, *)). Cover null, zero,
+// small, negative, and INT128 edges.
+TEST_F(DenseRowTest, hugeintEdges) {
+  using int128_t = __int128_t;
+  const int128_t kMax = (int128_t{1} << 126) + ((int128_t{1} << 126) - 1);
+  const int128_t kMin = -kMax - 1;
+  auto values = makeNullableFlatVector<int128_t>(
+      {kMin,
+       kMax,
+       int128_t{-1},
+       int128_t{0},
+       int128_t{1},
+       std::nullopt,
+       int128_t{1234567890123456789LL}});
+  roundTrip(makeRowVector({values}));
+}
+
+// 16-column "Mix" schema from the production shuffle matrix tests: every
+// supported scalar plus ARRAY/MAP/ROW.
+TEST_F(DenseRowTest, mixWideRow) {
+  auto type = ROW({
+      BOOLEAN(),
+      TINYINT(),
+      SMALLINT(),
+      INTEGER(),
+      BIGINT(),
+      DECIMAL(10, 2),
+      DECIMAL(38, 18),
+      REAL(),
+      DOUBLE(),
+      VARCHAR(),
+      VARBINARY(),
+      DATE(),
+      TIMESTAMP(),
+      ARRAY(INTEGER()),
+      MAP(VARCHAR(), BIGINT()),
+      ROW({INTEGER(), VARCHAR()}),
+  });
+  roundTrip(std::dynamic_pointer_cast<RowVector>(fuzzVector(type, 256, 41)));
+}
+
+// A top-level ROW whose nested-ROW child is dictionary-wrapped must round-trip.
+TEST_F(DenseRowTest, dictionaryWrappedNestedRow) {
+  auto innerInts = makeFlatVector<int32_t>({100, 200, 300, 400});
+  auto innerStrs = makeFlatVector<StringView>({"aaa", "bbb", "ccc", "ddd"});
+  auto baseNestedRow = makeRowVector({innerInts, innerStrs});
+
+  const std::vector<vector_size_t> dictIndices = {3, 0, 2, 1, 0, 3};
+  auto indicesBuf =
+      AlignedBuffer::allocate<vector_size_t>(dictIndices.size(), pool());
+  std::memcpy(
+      indicesBuf->asMutable<vector_size_t>(),
+      dictIndices.data(),
+      dictIndices.size() * sizeof(vector_size_t));
+  auto dictNestedRow = BaseVector::wrapInDictionary(
+      nullptr,
+      indicesBuf,
+      static_cast<vector_size_t>(dictIndices.size()),
+      baseNestedRow);
+
+  auto bigintCol = makeFlatVector<int64_t>({10, 20, 30, 40, 50, 60});
+  roundTrip(makeRowVector({bigintCol, dictNestedRow}));
+}
+
+TEST_F(DenseRowTest, arrayOfBigint) {
+  roundTrip(std::dynamic_pointer_cast<RowVector>(
+      fuzzVector(ROW({ARRAY(BIGINT())}), 128, 12)));
+}
+
+TEST_F(DenseRowTest, arrayOfArrayOfBigint) {
+  roundTrip(std::dynamic_pointer_cast<RowVector>(
+      fuzzVector(ROW({ARRAY(ARRAY(BIGINT()))}), 256, 13)));
+}
+
+TEST_F(DenseRowTest, mapBigintReal) {
+  roundTrip(std::dynamic_pointer_cast<RowVector>(
+      fuzzVector(ROW({MAP(BIGINT(), REAL())}), 128, 14)));
+}
+
+TEST_F(DenseRowTest, nestedRowOfMixedFields) {
+  auto type = ROW({
+      BIGINT(),
+      ARRAY(VARCHAR()),
+      MAP(INTEGER(), ARRAY(BIGINT())),
+      ROW({INTEGER(), VARCHAR()}),
+  });
+  roundTrip(std::dynamic_pointer_cast<RowVector>(fuzzVector(type, 128, 15)));
+}
+
+TEST_F(DenseRowTest, emptyContainers) {
+  auto input = makeRowVector({
+      makeArrayVector<int64_t>({{}, {}, {}}),
+      makeMapVector<int32_t, StringView>({{}, {}, {}}),
+      makeNestedArrayVectorFromJson<int64_t>({"[]", "[[]]", "[]"}),
+  });
+  roundTrip(input);
+}
+
+// Golden bytes pin the (marker-less) level-hoisted wire for
+// ARRAY<ARRAY<BIGINT>>. Row 0: [[1,2,3],[4,5,6]]; row 1: [[7],[8,9]].
+TEST_F(DenseRowTest, goldenBytesNestedArrays) {
+  auto input = makeRowVector({
+      makeNestedArrayVectorFromJson<int64_t>(
+          {"[[1,2,3],[4,5,6]]", "[[7],[8,9]]"}),
+  });
+  auto bytes = serializeToBytes(input);
+  // Row 0: 03 (outer=2+1) | 04 04 (inner=3+1,3+1) | 02 04 06 08 0a 0c (zz 1..6)
+  EXPECT_EQ(bytes.toHex(0), "030404020406080a0c");
+  // Row 1: 03 (outer) | 02 03 (inner=1+1,2+1) | 0e 10 12 (zz 7,8,9)
+  EXPECT_EQ(bytes.toHex(1), "0302030e1012");
+}
+
+// Golden bytes for MAP<BIGINT, REAL> with hoisted key/value segments.
+// Row 0: {1 -> 1.5, 2 -> 2.5}.
+TEST_F(DenseRowTest, goldenBytesMapHoistedKV) {
+  auto input = makeRowVector({
+      makeMapVector<int64_t, float>({{{1, 1.5f}, {2, 2.5f}}}),
+  });
+  auto bytes = serializeToBytes(input);
+  // 03 (card=2+1) | 02 04 (keys zz 1,2) | 0000c03f 00002040 (1.5f, 2.5f LE)
+  EXPECT_EQ(
+      bytes.toHex(0),
+      "030204"
+      "0000c03f"
+      "00002040");
+}
+
+// Golden bytes for the top-level all-scalar ROW shape (the slot-free fast
+// path). Per-row layout (marker-less):
+// [bigint][int][varchar_len+1|payload][real].
+TEST_F(DenseRowTest, goldenBytesScalarRow) {
+  auto type = ROW({BIGINT(), INTEGER(), VARCHAR(), REAL()});
+  auto bigint = makeFlatVector<int64_t>({1, -1});
+  auto integer = makeNullableFlatVector<int32_t>({2, std::nullopt});
+  auto varchar = makeFlatVector<StringView>({"ab", ""});
+  auto real = makeNullableFlatVector<float>({1.5f, std::nullopt});
+  auto input = makeRowVector({bigint, integer, varchar, real});
+
+  auto bytes = serializeToBytes(input);
+  // Row 0: bigint zz(1)=02, int zz(2)=04, varchar(len=2,"ab")=03 6162,
+  // real 1.5f bits 0x3fc00000 LE = 0000c03f.
+  EXPECT_EQ(
+      bytes.toHex(0),
+      "0204"
+      "036162"
+      "0000c03f");
+  // Row 1: bigint zz(adjust(-1))=zz(-2)=3 -> 03, int null=00,
+  // varchar(len=0)=01, real null = kNullFloatBits LE = 0000c07f.
+  EXPECT_EQ(
+      bytes.toHex(1),
+      "03"
+      "00"
+      "01"
+      "0000c07f");
+
+  // Round-trip restores the original.
+  std::vector<std::string_view> rows(2);
+  for (vector_size_t r = 0; r < 2; ++r) {
+    rows[r] = bytes.toView(r);
+  }
+  assertEqualVectors(input, DenseRow::deserialize(rows, type, pool()));
+}
+
+// Drive serialize() with reverse-order, gapped destination offsets to confirm
+// each row's bytes land exactly where the offset table says and nowhere else.
+TEST_F(DenseRowTest, serializeAtNonContiguousOffsets) {
+  auto type = ROW({BIGINT(), VARCHAR(), ARRAY(INTEGER())});
+  VectorFuzzer::Options opts;
+  opts.vectorSize = 8;
+  opts.nullRatio = 0.0;
+  opts.stringLength = 12;
+  opts.containerLength = 4;
+  VectorFuzzer fuzzer(opts, pool(), 41);
+  auto input = std::dynamic_pointer_cast<RowVector>(
+      fuzzer.fuzzFlat(type, opts.vectorSize));
+  const auto rowType = std::dynamic_pointer_cast<const RowType>(input->type());
+  const auto n = input->size();
+
+  DenseRow rows(input);
+  const auto& sizes = rows.rowSizes();
+
+  // Reverse buffer order, 7-byte gaps pre-filled with 0xCC.
+  constexpr size_t kGap = 7;
+  std::vector<size_t> offsets(n);
+  size_t cum = 0;
+  for (vector_size_t r = 0; r < n; ++r) {
+    const auto srcRow = static_cast<vector_size_t>(n - 1 - r);
+    offsets[srcRow] = cum;
+    cum += sizes[srcRow] + kGap;
+  }
+  std::vector<uint8_t> buffer(cum, /*fill=*/0xCC);
+
+  rows.serialize(
+      buffer.data(),
+      folly::Range<const size_t*>(offsets.data(), offsets.size()));
+
+  std::vector<bool> claimed(cum, false);
+  for (vector_size_t r = 0; r < n; ++r) {
+    for (size_t i = offsets[r]; i < offsets[r] + sizes[r]; ++i) {
+      claimed[i] = true;
+    }
+  }
+  for (size_t i = 0; i < cum; ++i) {
+    if (!claimed[i]) {
+      EXPECT_EQ(buffer[i], 0xCC) << "gap byte at " << i << " was overwritten";
+    }
+  }
+
+  std::vector<std::string_view> data(n);
+  for (vector_size_t r = 0; r < n; ++r) {
+    data[r] = std::string_view(
+        reinterpret_cast<const char*>(buffer.data() + offsets[r]), sizes[r]);
+  }
+  assertEqualVectors(input, DenseRow::deserialize(data, rowType, pool()));
+}
+
+} // namespace
+} // namespace bytedance::bolt::row
diff --git a/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp b/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp
index 92f7fd4c6..873559ed2 100644
--- a/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp
+++ b/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp
@@ -129,7 +129,10 @@ arrow::Status BoltRowBasedSortShuffleWriter::split(
           pidArr, rv->size(), row2Partition_, partition2RowCount_));
       strippedRv = getStrippedRowVectorWrapper(*rv);
     }
-    auto rowVectorWithStats = rowConverter_->getWithStats(strippedRv);
+    auto rowVectorWithStats = [&]() {
+      bytedance::bolt::NanosecondTimer timer(&convertTime_);
+      return rowConverter_->getWithStats(strippedRv);
+    }();
     if (!boltPool_->maybeReserve(rowVectorWithStats.getTotalMemorySize())) {
       if (boltPool_->reservedBytes() >= kMinMemLimit) {
         RETURN_NOT_OK(tryEvict());
@@ -160,8 +163,8 @@ arrow::Status BoltRowBasedSortShuffleWriter::initFromRowVector(
     const bytedance::bolt::RowVector& rv) {
   // rv is not stripped
   auto&& rowType = getStrippedRowVectorType(rv);
-  rowConverter_ =
-      std::make_unique<ShuffleColumnarToRowConverter>(rowType, boltPool_);
+  rowConverter_ = std::make_unique<ShuffleColumnarToRowConverter>(
+      rowType, boltPool_, options_.rowFormat);
   sortedRows_.resize(numPartitions_);
   partitionBytes_.resize(numPartitions_, 0);
   return arrow::Status::OK();
diff --git a/bolt/shuffle/sparksql/BoltShuffleReader.cpp b/bolt/shuffle/sparksql/BoltShuffleReader.cpp
index 6a5c6d045..760ff89de 100644
--- a/bolt/shuffle/sparksql/BoltShuffleReader.cpp
+++ b/bolt/shuffle/sparksql/BoltShuffleReader.cpp
@@ -763,8 +763,8 @@ BoltColumnarBatchDeserializerFactory::createDeserializer(
     zstdCodec_ = std::make_shared<AdaptiveParallelZstdCodec>(
         1 /*not used*/, false, memoryPool_, checksumEnabled_);
     rowBufferPool_ = std::make_shared<RowBufferPool>(memoryPool_);
-    row2ColConverter_ =
-        std::make_shared<ShuffleRowToColumnarConverter>(rowType_, boltPool_);
+    row2ColConverter_ = std::make_shared<ShuffleRowToColumnarConverter>(
+        rowType_, boltPool_, rowFormat_);
   }
   return std::make_unique<BoltColumnarBatchDeserializer>(
       std::move(in),
@@ -867,6 +867,7 @@ BoltShuffleReader::BoltShuffleReader(
   factory_->setNumPartitions(options.numPartitions);
   factory_->setShuffleWriterType(options.forceShuffleWriterType);
   factory_->setpartitioningShortName(options.partitionShortName);
+  factory_->setRowFormat(options.rowFormat);
 }
 
 } // namespace bytedance::bolt::shuffle::sparksql
diff --git a/bolt/shuffle/sparksql/BoltShuffleReader.h b/bolt/shuffle/sparksql/BoltShuffleReader.h
index 624e18c07..85207a1a2 100644
--- a/bolt/shuffle/sparksql/BoltShuffleReader.h
+++ b/bolt/shuffle/sparksql/BoltShuffleReader.h
@@ -217,6 +217,10 @@ class BoltColumnarBatchDeserializerFactory {
     partitioningShortName_ = name;
   }
 
+  void setRowFormat(bytedance::bolt::row::RowFormat rowFormat) {
+    rowFormat_ = rowFormat;
+  }
+
  private:
   std::shared_ptr<arrow::Schema> schema_;
   std::shared_ptr<Codec> codec_;
@@ -226,6 +230,8 @@ class BoltColumnarBatchDeserializerFactory {
   int32_t numPartitions_{0};
   ShuffleWriterType shuffleWriterType_{ShuffleWriterType::V1};
   std::string partitioningShortName_;
+  bytedance::bolt::row::RowFormat rowFormat_{
+      bytedance::bolt::row::RowFormat::COMPACT};
   arrow::MemoryPool* memoryPool_;
   bytedance::bolt::memory::MemoryPool* boltPool_;
 
diff --git a/bolt/shuffle/sparksql/Options.h b/bolt/shuffle/sparksql/Options.h
index bcc10dbb1..358a26b74 100644
--- a/bolt/shuffle/sparksql/Options.h
+++ b/bolt/shuffle/sparksql/Options.h
@@ -36,6 +36,7 @@
 #include <bolt/common/base/Exceptions.h>
 #include <fmt/format.h>
 #include <cstdint>
+#include "bolt/row/RowFormat.h"
 #include "bolt/shuffle/sparksql/compression/Codec.h"
 #include "bolt/shuffle/sparksql/partition_writer/rss/RssClient.h"
 #include "bolt/shuffle/sparksql/partitioner/Partitioning.h"
@@ -102,6 +103,9 @@ struct ShuffleReaderOptions {
   std::string partitionShortName = "";
   int32_t forceShuffleWriterType = -1;
 
+  // On-wire row format for the row-based shuffle. Must match the writer side.
+  row::RowFormat rowFormat = row::RowFormat::COMPACT;
+
   // Enable checksum in codec for shuffle data corruption detection
   bool checksumEnabled = true;
 };
@@ -160,6 +164,7 @@ struct ShuffleWriterOptions {
   int32_t recommendedColumn2RowSize = 0;
   double shuffleCheckRatio = 0;
   int32_t shuffleCheckMaxColumns = kDefaultShuffleCheckMaxColumns;
+  row::RowFormat rowFormat = row::RowFormat::COMPACT;
   PartitionWriterOptions partitionWriterOptions{};
 };
 
diff --git a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp
index 801470419..86373d20c 100644
--- a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp
+++ b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp
@@ -31,36 +31,46 @@
 
 #include "bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h"
 #include <bolt/common/base/SuccinctPrinter.h>
+#include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
+#include <limits>
 
 #include "bolt/row/CompactRow.h"
+#include "bolt/row/dense/DenseRow.h"
 using namespace bytedance;
 namespace bytedance::bolt::shuffle::sparksql {
 
 void ShuffleColumnarToRowConverter::init(
     const bytedance::bolt::RowTypePtr& rowType) {
-  if (auto fixedRowSize = bolt::row::CompactRow::fixedRowSize(rowType)) {
-    fixedRowSize_ = fixedRowSize.value();
+  if (rowFormat_ == row::RowFormat::COMPACT) {
+    if (auto fixedRowSize = bolt::row::CompactRow::fixedRowSize(rowType)) {
+      fixedRowSize_ = fixedRowSize.value();
+    }
   }
 }
-
 ShuffleColumnarToRowConverter::RowVectorWithStats
 ShuffleColumnarToRowConverter::getWithStats(
     const bytedance::bolt::RowVectorPtr& rowVector) {
   RowVectorWithStats stats;
-  stats.compactRow = std::make_shared<bolt::row::CompactRow>(rowVector);
   stats.numRows = rowVector->size();
   stats.totalMemorySize = 0;
   auto numRows = rowVector->size();
-  if (fixedRowSize_) {
-    stats.totalMemorySize = fixedRowSize_ * numRows;
-  } else {
-    for (auto i = 0; i < numRows; ++i) {
-      stats.totalMemorySize += stats.compactRow->rowSize(i);
+  if (rowFormat_ == row::RowFormat::COMPACT) {
+    stats.compactRow = std::make_unique<row::CompactRow>(rowVector);
+    if (fixedRowSize_) {
+      stats.totalMemorySize = fixedRowSize_ * numRows;
+    } else {
+      for (auto i = 0; i < numRows; ++i) {
+        stats.totalMemorySize += stats.compactRow->rowSize(i);
+      }
     }
+  } else {
+    stats.denseRow = std::make_unique<row::DenseRow>(rowVector);
+    stats.totalMemorySize = static_cast<int64_t>(stats.denseRow->totalSize());
   }
-  // layout : rowSize | unsafeRow
+  // layout : rowSize | rowData
   stats.totalMemorySize += numRows * kSizeOfRowHeader;
   return stats;
 }
@@ -70,13 +80,33 @@ void ShuffleColumnarToRowConverter::convert(
     const std::vector<uint32_t>& indexes,
     std::vector<std::vector<uint8_t*>>& sortedRows,
     std::vector<int64_t>& partitionBytes) {
-  auto numRows = rowVector.numRows;
+  const auto numRows = rowVector.numRows;
   totalBufferSize_ += rowVector.totalMemorySize;
   boltBuffers_.emplace_back(
       RowInternalBuffer::allocate(rowVector.totalMemorySize, boltPool_));
   bufferAddress_ = boltBuffers_.back()->mutable_data();
-  memset(bufferAddress_, 0, sizeof(int8_t) * rowVector.totalMemorySize);
   averageRowSize_ = numRows ? (rowVector.totalMemorySize / numRows) : 0;
+
+  if (rowFormat_ == row::RowFormat::DENSE) {
+    const std::vector<size_t>& rowSizesVec = rowVector.denseRow->rowSizes();
+    std::vector<size_t> bodyOffsets(numRows);
+    size_t cursor = 0; // size_t, not uint32_t: offsets must not wrap at 4 GiB.
+    for (int64_t r = 0; r < numRows; ++r) {
+      const auto rowSize = static_cast<int32_t>(rowSizesVec[r]);
+      *reinterpret_cast<int32_t*>(bufferAddress_ + cursor) = rowSize;
+      bodyOffsets[r] = cursor + kSizeOfRowHeader;
+      sortedRows[indexes[r]].push_back(bufferAddress_ + cursor);
+      partitionBytes[indexes[r]] += rowSize + kSizeOfRowHeader;
+      cursor += rowSizesVec[r] + kSizeOfRowHeader;
+    }
+    // Size headers are written; serialize each row body at bodyOffsets[r].
+    rowVector.denseRow->serialize(
+        bufferAddress_,
+        folly::Range<const size_t*>(bodyOffsets.data(), bodyOffsets.size()));
+    return;
+  }
+
+  std::memset(bufferAddress_, 0, rowVector.totalMemorySize);
   size_t offset = kSizeOfRowHeader;
   for (auto i = 0; i < numRows; ++i) {
     auto rowSize =
diff --git a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h
index 323b2ba37..66191e582 100644
--- a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h
+++ b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h
@@ -34,8 +34,14 @@
 #include <arrow/memory_pool.h>
 #include <arrow/type.h>
 
+#include <memory>
+#include <optional>
+#include <vector>
+
 #include "bolt/buffer/Buffer.h"
 #include "bolt/row/CompactRow.h"
+#include "bolt/row/RowFormat.h"
+#include "bolt/row/dense/DenseRow.h"
 #include "bolt/vector/ComplexVector.h"
 namespace bytedance::bolt::shuffle::sparksql {
 static const uint32_t kSizeOfRowHeader = sizeof(int32_t);
@@ -82,8 +88,10 @@ class ShuffleColumnarToRowConverter {
  public:
   explicit ShuffleColumnarToRowConverter(
       const bytedance::bolt::RowTypePtr& rowType,
-      bytedance::bolt::memory::MemoryPool* boltPool)
-      : boltPool_(boltPool) {
+      bytedance::bolt::memory::MemoryPool* boltPool,
+      bytedance::bolt::row::RowFormat rowFormat =
+          bytedance::bolt::row::RowFormat::COMPACT)
+      : boltPool_(boltPool), rowFormat_(rowFormat) {
     init(rowType);
   }
 
@@ -96,9 +104,10 @@ class ShuffleColumnarToRowConverter {
     }
 
    private:
-    std::shared_ptr<bytedance::bolt::row::CompactRow> compactRow;
-    int64_t numRows;
-    int64_t totalMemorySize;
+    std::unique_ptr<row::CompactRow> compactRow;
+    std::unique_ptr<row::DenseRow> denseRow;
+    int64_t numRows{0};
+    int64_t totalMemorySize{0};
   };
 
   RowVectorWithStats getWithStats(
@@ -125,12 +134,12 @@ class ShuffleColumnarToRowConverter {
 
  private:
   void init(const bytedance::bolt::RowTypePtr& rowType);
-
   int32_t fixedRowSize_ = 0;
   uint8_t* bufferAddress_;
   int64_t totalBufferSize_{0};
   size_t averageRowSize_{0};
   bytedance::bolt::memory::MemoryPool* boltPool_;
+  bytedance::bolt::row::RowFormat rowFormat_;
   std::vector<RowInternalBufferPtr> boltBuffers_;
 };
 
diff --git a/bolt/shuffle/sparksql/ShuffleReaderNode.cpp b/bolt/shuffle/sparksql/ShuffleReaderNode.cpp
index 8e8272cf6..c7f652ca6 100644
--- a/bolt/shuffle/sparksql/ShuffleReaderNode.cpp
+++ b/bolt/shuffle/sparksql/ShuffleReaderNode.cpp
@@ -46,7 +46,8 @@ SparkShuffleReader::SparkShuffleReader(
       rowBufferPool_(std::make_shared<RowBufferPool>(arrowPool_.get())),
       row2ColConverter_(std::make_shared<ShuffleRowToColumnarConverter>(
           outputType_,
-          pool())) {
+          pool(),
+          shuffleReaderOptions_.rowFormat)) {
   isValidityBuffer_.reserve(outputType_->size());
   for (size_t i = 0; i < outputType_->size(); ++i) {
     switch (outputType_->childAt(i)->kind()) {
diff --git a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp
index 3979bbf26..e2d0c4000 100644
--- a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp
+++ b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp
@@ -31,18 +31,22 @@
 
 #include "bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h"
 #include "bolt/row/CompactRow.h"
+#include "bolt/row/dense/DenseRow.h"
 #include "bolt/vector/arrow/Bridge.h"
 using namespace bytedance::bolt;
 namespace bytedance::bolt::shuffle::sparksql {
 ShuffleRowToColumnarConverter::ShuffleRowToColumnarConverter(
     const bytedance::bolt::RowTypePtr& rowType,
-    memory::MemoryPool* memoryPool)
-    : rowType_(rowType), pool_(memoryPool) {}
+    memory::MemoryPool* memoryPool,
+    bytedance::bolt::row::RowFormat rowFormat)
+    : rowType_(rowType), pool_(memoryPool), rowFormat_(rowFormat) {}
 
 RowVectorPtr ShuffleRowToColumnarConverter::convert(
     std::vector<std::string_view>& rows) {
-  auto vp = row::CompactRow::deserialize(rows, rowType_, pool_);
-  return std::dynamic_pointer_cast<RowVector>(vp);
+  if (rowFormat_ == row::RowFormat::COMPACT) {
+    return row::CompactRow::deserialize(rows, rowType_, pool_);
+  }
+  return row::DenseRow::deserialize(rows, rowType_, pool_);
 }
 
 RowVectorPtr ShuffleRowToColumnarConverter::convertToComposite(
diff --git a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h
index 74f3e31f7..0eec9b791 100644
--- a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h
+++ b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h
@@ -33,6 +33,7 @@
 
 #include <arrow/c/abi.h>
 #include "bolt/common/memory/Memory.h"
+#include "bolt/row/RowFormat.h"
 #include "bolt/type/Type.h"
 #include "bolt/vector/ComplexVector.h"
 namespace bytedance::bolt::shuffle::sparksql {
@@ -41,7 +42,12 @@ class ShuffleRowToColumnarConverter {
  public:
+  // `rowFormat` names the serialized row layout that convert() decodes.
+  // It defaults to COMPACT, so existing two-argument callers keep the
+  // decoder they used before this parameter existed.
   ShuffleRowToColumnarConverter(
       const bytedance::bolt::RowTypePtr& rowType,
-      bytedance::bolt::memory::MemoryPool* memoryPool);
+      bytedance::bolt::memory::MemoryPool* memoryPool,
+      bytedance::bolt::row::RowFormat rowFormat =
+          bytedance::bolt::row::RowFormat::COMPACT);
 
   bytedance::bolt::RowVectorPtr convert(std::vector<std::string_view>& rows);
 
@@ -52,6 +58,8 @@ class ShuffleRowToColumnarConverter {
  protected:
   bytedance::bolt::RowTypePtr rowType_;
   bytedance::bolt::memory::MemoryPool* pool_;
+  // Serialized row layout expected by convert(); initialized by the constructor.
+  bytedance::bolt::row::RowFormat rowFormat_;
 };
 
 } // namespace bytedance::bolt::shuffle::sparksql
diff --git a/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp b/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp
index 86415e7db..993552c46 100644
--- a/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp
+++ b/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp
@@ -42,7 +42,18 @@ std::vector<ShuffleTestParam> buildShuffleParams() {
                   dataTypeGroup,
                   numPartitions,
                   numMappers};
-              if (param.isSupported()) {
+              if (!param.isSupported()) {
+                continue;
+              }
+              if (shuffleMode == 3) {
+                // RowBased: one case per on-wire row format.
+                for (const auto format :
+                     {bytedance::bolt::row::RowFormat::DENSE,
+                      bytedance::bolt::row::RowFormat::COMPACT}) {
+                  param.rowFormat = format;
+                  params.push_back(param);
+                }
+              } else {
                 params.push_back(param);
               }
             }
diff --git a/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp b/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp
index badaa582d..5eb1a8a4d 100644
--- a/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp
+++ b/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp
@@ -161,14 +161,16 @@ std::string ShuffleTestParam::toString() const {
   auto memStr = fmt::format("{}{}", v, units[u]);
 
   return fmt::format(
-      "{}_{}_{}_{}_M{}_P{}_{}",
+      "{}_{}_{}_{}_M{}_P{}_{}_{}",
       partitioning,
       shuffleModeToString(shuffleMode),
       writerTypeToString(writerType),
       dataTypeGroupToString(dataTypeGroup),
       numMappers,
       numPartitions,
-      memStr);
+      memStr,
+      // Last field: on-wire row format tag.
+      rowFormat == row::RowFormat::COMPACT ? "Compact" : "Dense");
 }
 
 bool ShuffleTestParam::isSupported() const {
@@ -513,6 +515,7 @@ ShuffleRunResult ShuffleTestBase::runShuffle(
     writerOptions.partitioning = toPartitioning(param.partitioning);
     writerOptions.partitionWriterOptions.numPartitions = param.numPartitions;
     writerOptions.forceShuffleWriterType = param.shuffleMode;
+    writerOptions.rowFormat = param.rowFormat;
     writerOptions.partitionWriterOptions.partitionWriterType = param.writerType;
     writerOptions.taskAttemptId = memoryManagerHolder->taskAttemptId();
     writerOptions.partitionWriterOptions.shuffleBufferSize =
@@ -640,6 +643,7 @@ ShuffleRunResult ShuffleTestBase::runShuffle(
     ShuffleReaderOptions readerOptions;
     readerOptions.numPartitions = param.numPartitions;
     readerOptions.forceShuffleWriterType = param.shuffleMode;
+    readerOptions.rowFormat = param.rowFormat;
     readerOptions.partitionShortName = param.partitioning;
     readerOptions.shuffleBatchByteSize = 1024 * 1024; // 1MB
 
diff --git a/bolt/shuffle/sparksql/tests/ShuffleTestBase.h b/bolt/shuffle/sparksql/tests/ShuffleTestBase.h
index 204b265ff..f0552a190 100644
--- a/bolt/shuffle/sparksql/tests/ShuffleTestBase.h
+++ b/bolt/shuffle/sparksql/tests/ShuffleTestBase.h
@@ -67,6 +67,9 @@ struct ShuffleTestParam {
   int32_t numBatches = 4;
   int32_t shuffleBufferSize = kDefaultShuffleWriterBufferSize;
   bool verifyOutput = true;
+  // On-wire row format for the RowBased writer (shuffleMode == 3); ignored by
+  // the other modes. Threaded into both writer and reader options.
+  row::RowFormat rowFormat = bytedance::bolt::row::RowFormat::DENSE;
 
   std::string toString() const;