diff --git a/Makefile b/Makefile index 4b2d1a8d5..5b480e53b 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ CONAN_CONFIG ?= CONAN_OVERRIDE ?= BUILD_VERSION ?= main -PROFILE=default +PROFILE ?= default BUILD_TYPE=Release # Note that, `benchmarks` and `test coverage` shouldn't be included in conan's options/configs, diff --git a/bolt/row/CMakeLists.txt b/bolt/row/CMakeLists.txt index e840e41ea..a06025be5 100644 --- a/bolt/row/CMakeLists.txt +++ b/bolt/row/CMakeLists.txt @@ -25,7 +25,15 @@ # This modified file is released under the same license. # -------------------------------------------------------------------------- -bolt_add_library(bolt_row_fast CompactRow.cpp UnsafeRowFast.cpp) +bolt_add_library( + bolt_row_fast + CompactRow.cpp + UnsafeRowFast.cpp + dense/DenseRow.cpp + dense/DenseRowGeneralEncode.cpp + dense/DenseRowGeneralDecode.cpp + dense/DenseRowScalarEncode.cpp + dense/DenseRowScalarDecode.cpp) target_link_libraries(bolt_row_fast PUBLIC bolt_vector) diff --git a/bolt/row/RowFormat.h b/bolt/row/RowFormat.h new file mode 100644 index 000000000..0029a9a1a --- /dev/null +++ b/bolt/row/RowFormat.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace bytedance::bolt::row { + +enum class RowFormat : uint8_t { + DENSE = 0, + COMPACT = 1, +}; + +} // namespace bytedance::bolt::row diff --git a/bolt/row/benchmark/CMakeLists.txt b/bolt/row/benchmark/CMakeLists.txt index bb6dd5fc1..a9578b4e7 100644 --- a/bolt/row/benchmark/CMakeLists.txt +++ b/bolt/row/benchmark/CMakeLists.txt @@ -14,9 +14,20 @@ # limitations under the License. add_executable(unsafe_row_serialize_benchmark UnsafeRowSerializeBenchmark.cpp) +add_executable(dense_row_serialize_benchmark DenseRowSerializeBenchmark.cpp) target_link_libraries( unsafe_row_serialize_benchmark + bolt_row_fast + bolt_vector_fuzzer + bolt_testutils + ${FOLLY_BENCHMARK} + GTest::gtest +) + +target_link_libraries( + dense_row_serialize_benchmark + bolt_row_fast bolt_vector_fuzzer bolt_testutils ${FOLLY_BENCHMARK} diff --git a/bolt/row/benchmark/DenseRowSerializeBenchmark.cpp b/bolt/row/benchmark/DenseRowSerializeBenchmark.cpp new file mode 100644 index 000000000..9d18a0c4a --- /dev/null +++ b/bolt/row/benchmark/DenseRowSerializeBenchmark.cpp @@ -0,0 +1,913 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bolt/row/CompactRow.h" +#include "bolt/row/UnsafeRowDeserializers.h" +#include "bolt/row/UnsafeRowFast.h" +#include "bolt/row/dense/DenseRow.h" +#include "bolt/row/dense/DenseRowScalar.h" +#include "bolt/row/dense/IntVarint.h" +#include "bolt/vector/FlatVector.h" +#include "bolt/vector/fuzzer/VectorFuzzer.h" + +namespace bytedance::bolt::row { +namespace { + +// Serialize a RowVector to one contiguous buffer + (N + 1) cumulative row +// offsets via DenseRow (the self-allocating shape the benchmark relies on). +// DenseRow is marker-less, so strip any top-level nulls first. +struct DenseSerialized { + BufferPtr buffer; + BufferPtr rowOffsets; +}; + +DenseSerialized denseSerialize( + const RowVectorPtr& data, + memory::MemoryPool* pool) { + RowVectorPtr input = data; + if (data->mayHaveNulls()) { + input = std::make_shared( + pool, data->type(), /*nulls=*/nullptr, data->size(), data->children()); + } + DenseRow rows(input); + const auto n = rows.numRows(); + auto offsetsBuf = AlignedBuffer::allocate(n + 1, pool); + auto* offs = offsetsBuf->asMutable(); + size_t cum = 0; + for (vector_size_t r = 0; r < n; ++r) { + offs[r] = cum; + cum += rows.rowSizes()[r]; + } + offs[n] = cum; + auto buf = AlignedBuffer::allocate(std::max(cum, 1u), pool); + rows.serialize( + reinterpret_cast(buf->asMutable()), + folly::Range(offs, n)); + return {std::move(buf), std::move(offsetsBuf)}; +} + +enum class SerdeDataKind { + kDefault, + kBigintScalar, + kBigintArray, + kBigintNestedArray, + kBigintMap, + kDoubleRandom, + kStringLen8, + kStringLen100, + kMultiScalar5Small, + kMultiScalar10Small, +}; + +struct BigintRange { + int64_t minInclusive; + int64_t maxInclusive; +}; + +struct SerdeOnlyBenchmarkCase { + RowTypePtr rowType; + SerdeDataKind dataKind{SerdeDataKind::kDefault}; + // For kBigint* data kinds: nullopt means full int64 range (i.e., the + // previous "random" case); otherwise the BIGINT values are drawn + // uniformly from [min, max]. + std::optional bigintRange{std::nullopt}; + // Fraction of null child values, applied on the kDefault fuzz path. Lets a + // case exercise the null-handling (non-fast) encode/decode path. + double nullRatio{0.0}; +}; + +RowVectorPtr makeRangeBigintData( + VectorFuzzer& fuzzer, + int64_t minValueInclusive, + int64_t maxValueInclusive) { + using namespace generator_spec_maker; + + BOLT_CHECK_LE(minValueInclusive, maxValueInclusive); + + auto rowSpec = RANDOM_ROW({RANDOM_BIGINT( + [minValueInclusive, maxValueInclusive](FuzzerGenerator& rng) -> int64_t { + return std::uniform_int_distribution( + minValueInclusive, maxValueInclusive)(rng); + })}); + auto vector = fuzzer.fuzz(*rowSpec); + auto rowVector = std::dynamic_pointer_cast(vector); + BOLT_CHECK_NOT_NULL(rowVector); + return rowVector; +} + +// Returns a generator that produces BIGINT values uniformly distributed in +// [min, max]. Captured into RANDOM_BIGINT specs below. +auto makeBigintGen(int64_t minValueInclusive, int64_t maxValueInclusive) { + return + [minValueInclusive, maxValueInclusive](FuzzerGenerator& rng) -> int64_t { + return std::uniform_int_distribution( + minValueInclusive, maxValueInclusive)(rng); + }; +} + +// Generator that produces array/map sizes uniformly in [0, 10]. Matches +// VectorFuzzer's default container-length distribution. +auto containerSizeGen() { + return [](FuzzerGenerator& rng) -> vector_size_t { + return std::uniform_int_distribution(0, 10)(rng); + }; +} + +RowVectorPtr makeRangeBigintArrayData( + VectorFuzzer& fuzzer, + int64_t minValueInclusive, + int64_t maxValueInclusive) { + using namespace generator_spec_maker; + BOLT_CHECK_LE(minValueInclusive, maxValueInclusive); + + // ROW({BIGINT, ARRAY(BIGINT in [min,max])}). + auto rowSpec = RANDOM_ROW({ + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_ARRAY( + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + containerSizeGen()), + }); + auto vector = fuzzer.fuzz(*rowSpec); + auto rowVector = std::dynamic_pointer_cast(vector); + BOLT_CHECK_NOT_NULL(rowVector); + return rowVector; +} + +RowVectorPtr makeRangeBigintNestedArrayData( + VectorFuzzer& fuzzer, + int64_t minValueInclusive, + int64_t maxValueInclusive) { + using namespace generator_spec_maker; + BOLT_CHECK_LE(minValueInclusive, maxValueInclusive); + + // ROW({BIGINT, ARRAY(ARRAY(BIGINT in [min,max]))}). + auto rowSpec = RANDOM_ROW({ + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_ARRAY( + RANDOM_ARRAY( + RANDOM_BIGINT( + makeBigintGen(minValueInclusive, maxValueInclusive)), + containerSizeGen()), + containerSizeGen()), + }); + auto vector = fuzzer.fuzz(*rowSpec); + auto rowVector = std::dynamic_pointer_cast(vector); + BOLT_CHECK_NOT_NULL(rowVector); + return rowVector; +} + +// Flat 5-column ROW with the BIGINT constrained to [min, max] and the other +// four narrow-/fixed-width columns (DOUBLE, BOOLEAN, TINYINT, REAL) drawn +// from their default distributions. Used to measure the multi-scalar path +// when the bigint values fit in 1-byte varints. +RowVectorPtr makeMultiScalar5SmallData( + VectorFuzzer& fuzzer, + int64_t minValueInclusive, + int64_t maxValueInclusive) { + using namespace generator_spec_maker; + auto rowSpec = RANDOM_ROW({ + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_DOUBLE([](FuzzerGenerator& rng) -> double { + return std::uniform_real_distribution(-1.0, 1.0)(rng); + }), + RANDOM_BOOLEAN([](FuzzerGenerator& rng) -> bool { + return std::uniform_int_distribution(0, 1)(rng) != 0; + }), + RANDOM_TINYINT([](FuzzerGenerator& rng) -> int8_t { + return std::uniform_int_distribution(-127, 127)(rng); + }), + RANDOM_REAL([](FuzzerGenerator& rng) -> float { + return std::uniform_real_distribution(-1.0f, 1.0f)(rng); + }), + }); + auto vector = fuzzer.fuzz(*rowSpec); + auto rowVector = std::dynamic_pointer_cast(vector); + BOLT_CHECK_NOT_NULL(rowVector); + return rowVector; +} + +// Flat 10-column ROW where every integer-width column (BIGINT, INTEGER, +// SMALLINT, TINYINT) is constrained to [min, max] so its varint encoding +// is short. DOUBLE / REAL stay fixed-width. +RowVectorPtr makeMultiScalar10SmallData( + VectorFuzzer& fuzzer, + int64_t minValueInclusive, + int64_t maxValueInclusive) { + using namespace generator_spec_maker; + auto narrowReal = [](FuzzerGenerator& rng) -> float { + return std::uniform_real_distribution(-1.0f, 1.0f)(rng); + }; + auto narrowDouble = [](FuzzerGenerator& rng) -> double { + return std::uniform_real_distribution(-1.0, 1.0)(rng); + }; + auto narrowBool = [](FuzzerGenerator& rng) -> bool { + return std::uniform_int_distribution(0, 1)(rng) != 0; + }; + const auto clamp32 = [&](int64_t v) -> int32_t { + return static_cast(std::clamp( + v, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); + }; + const auto clamp16 = [&](int64_t v) -> int16_t { + return static_cast(std::clamp( + v, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); + }; + const auto clamp8 = [&](int64_t v) -> int8_t { + return static_cast(std::clamp( + v, + static_cast(std::numeric_limits::min()), + static_cast(std::numeric_limits::max()))); + }; + const int32_t intMin = clamp32(minValueInclusive); + const int32_t intMax = clamp32(maxValueInclusive); + const int16_t smallMin = clamp16(minValueInclusive); + const int16_t smallMax = clamp16(maxValueInclusive); + const int8_t tinyMin = clamp8(minValueInclusive); + const int8_t tinyMax = clamp8(maxValueInclusive); + + auto rowSpec = RANDOM_ROW({ + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_INTEGER([intMin, intMax](FuzzerGenerator& rng) -> int32_t { + return std::uniform_int_distribution(intMin, intMax)(rng); + }), + RANDOM_SMALLINT([smallMin, smallMax](FuzzerGenerator& rng) -> int16_t { + return static_cast( + std::uniform_int_distribution(smallMin, smallMax)(rng)); + }), + RANDOM_TINYINT([tinyMin, tinyMax](FuzzerGenerator& rng) -> int8_t { + return static_cast( + std::uniform_int_distribution(tinyMin, tinyMax)(rng)); + }), + RANDOM_REAL(narrowReal), + RANDOM_DOUBLE(narrowDouble), + RANDOM_BOOLEAN(narrowBool), + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_INTEGER([intMin, intMax](FuzzerGenerator& rng) -> int32_t { + return std::uniform_int_distribution(intMin, intMax)(rng); + }), + RANDOM_DOUBLE(narrowDouble), + }); + auto vector = fuzzer.fuzz(*rowSpec); + auto rowVector = std::dynamic_pointer_cast(vector); + BOLT_CHECK_NOT_NULL(rowVector); + return rowVector; +} + +RowVectorPtr makeRangeBigintMapData( + VectorFuzzer& fuzzer, + int64_t minValueInclusive, + int64_t maxValueInclusive) { + using namespace generator_spec_maker; + BOLT_CHECK_LE(minValueInclusive, maxValueInclusive); + + // ROW({BIGINT, MAP(BIGINT in [min,max], REAL)}). + auto rowSpec = RANDOM_ROW({ + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_MAP( + RANDOM_BIGINT(makeBigintGen(minValueInclusive, maxValueInclusive)), + RANDOM_REAL([](FuzzerGenerator& rng) -> float { + return std::uniform_real_distribution(-1.0f, 1.0f)(rng); + }), + containerSizeGen()), + }); + auto vector = fuzzer.fuzz(*rowSpec); + auto rowVector = std::dynamic_pointer_cast(vector); + BOLT_CHECK_NOT_NULL(rowVector); + return rowVector; +} + +// For kBigintScalar / kBigintArray with a range, falls back to the +// fuzzer's full-range generator if `range` is nullopt. +constexpr BigintRange kFullBigintRange{ + std::numeric_limits::min(), + std::numeric_limits::max()}; + +RowVectorPtr makeSerdeOnlyData( + const SerdeOnlyBenchmarkCase& benchmarkCase, + memory::MemoryPool* pool) { + VectorFuzzer::Options options; + options.vectorSize = 1'000; + options.nullRatio = benchmarkCase.nullRatio; + + if (benchmarkCase.dataKind == SerdeDataKind::kStringLen8) { + options.stringLength = 8; + options.stringVariableLength = false; + } + + if (benchmarkCase.dataKind == SerdeDataKind::kStringLen100) { + options.stringLength = 100; + options.stringVariableLength = false; + } + + const auto seed = 1; + VectorFuzzer fuzzer(options, pool, seed); + + switch (benchmarkCase.dataKind) { + case SerdeDataKind::kBigintScalar: { + const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange); + return makeRangeBigintData( + fuzzer, range.minInclusive, range.maxInclusive); + } + case SerdeDataKind::kBigintArray: { + const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange); + return makeRangeBigintArrayData( + fuzzer, range.minInclusive, range.maxInclusive); + } + case SerdeDataKind::kBigintNestedArray: { + const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange); + return makeRangeBigintNestedArrayData( + fuzzer, range.minInclusive, range.maxInclusive); + } + case SerdeDataKind::kBigintMap: { + const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange); + return makeRangeBigintMapData( + fuzzer, range.minInclusive, range.maxInclusive); + } + case SerdeDataKind::kMultiScalar5Small: { + const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange); + return makeMultiScalar5SmallData( + fuzzer, range.minInclusive, range.maxInclusive); + } + case SerdeDataKind::kMultiScalar10Small: { + const auto range = benchmarkCase.bigintRange.value_or(kFullBigintRange); + return makeMultiScalar10SmallData( + fuzzer, range.minInclusive, range.maxInclusive); + } + case SerdeDataKind::kDoubleRandom: + case SerdeDataKind::kStringLen8: + case SerdeDataKind::kStringLen100: + case SerdeDataKind::kDefault: + // fuzzFlat (not fuzzInputRow): guarantee flat children. fuzzInputRow may + // wrap a column in a dictionary, which adds decode/null-merge cost + // (DecodedVector::setFlatNulls) unrelated to the row codec. + return std::dynamic_pointer_cast( + fuzzer.fuzzFlat(benchmarkCase.rowType)); + } + + BOLT_UNREACHABLE(); +} + +size_t computeUnsafeTotalSize( + UnsafeRowFast& unsafeRow, + const RowTypePtr& rowType, + vector_size_t numRows) { + size_t totalSize = 0; + if (auto fixedRowSize = UnsafeRowFast::fixedRowSize(rowType)) { + totalSize += fixedRowSize.value() * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + totalSize += unsafeRow.rowSize(i); + } + } + return totalSize; +} + +size_t computeCompactTotalSize( + CompactRow& compactRow, + const RowTypePtr& rowType, + vector_size_t numRows) { + size_t totalSize = 0; + if (auto fixedRowSize = CompactRow::fixedRowSize(rowType)) { + totalSize += fixedRowSize.value() * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + totalSize += compactRow.rowSize(i); + } + } + return totalSize; +} + +size_t serializeUnsafeToBuffer( + UnsafeRowFast& unsafeRow, + vector_size_t numRows, + char* rawBuffer) { + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + offset += unsafeRow.serialize(i, rawBuffer + offset); + } + return offset; +} + +size_t serializeCompactToBuffer( + CompactRow& compactRow, + vector_size_t numRows, + char* rawBuffer) { + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + offset += compactRow.serialize(i, rawBuffer + offset); + } + return offset; +} + +std::vector> serializeUnsafeRows( + UnsafeRowFast& unsafeRow, + vector_size_t numRows, + BufferPtr& buffer) { + std::vector> serialized; + serialized.reserve(numRows); + auto* rawBuffer = buffer->asMutable(); + + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + auto rowSize = unsafeRow.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, rowSize)); + offset += rowSize; + } + + BOLT_CHECK_EQ(buffer->size(), offset); + return serialized; +} + +std::vector serializeCompactRows( + CompactRow& compactRow, + vector_size_t numRows, + BufferPtr& buffer) { + std::vector serialized; + serialized.reserve(numRows); + auto* rawBuffer = buffer->asMutable(); + + size_t offset = 0; + for (auto i = 0; i < numRows; ++i) { + auto rowSize = compactRow.serialize(i, rawBuffer + offset); + serialized.push_back(std::string_view(rawBuffer + offset, rowSize)); + offset += rowSize; + } + + BOLT_CHECK_EQ(buffer->size(), offset); + return serialized; +} + +int unsafeSer(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + suspender.dismiss(); + + // Full Vector -> buffer: build the serializer, size it, allocate, write. + for (int i = 0; i < nIters; ++i) { + UnsafeRowFast unsafeRow(data); + const auto totalSize = + computeUnsafeTotalSize(unsafeRow, benchmarkCase.rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool.get()); + folly::doNotOptimizeAway(serializeUnsafeToBuffer( + unsafeRow, data->size(), buffer->asMutable())); + } + return nIters * data->size(); +} + +int compactSer(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + const auto numRows = data->size(); + const auto fixed = CompactRow::fixedRowSize(benchmarkCase.rowType); + suspender.dismiss(); + + // Full Vector -> buffer: build CompactRow, compute per-row offsets (its size + // pass), allocate (pre-zeroed for null-bit handling), batch serialize. + for (int i = 0; i < nIters; ++i) { + CompactRow compactRow(data); + std::vector offsets(numRows); + size_t cum = 0; + for (vector_size_t r = 0; r < numRows; ++r) { + offsets[r] = cum; + cum += fixed ? *fixed : static_cast(compactRow.rowSize(r)); + } + auto buffer = + AlignedBuffer::allocate(std::max(cum, 1u), pool.get(), 0); + compactRow.serialize(0, numRows, offsets.data(), buffer->asMutable()); + folly::doNotOptimizeAway(buffer); + } + return nIters * numRows; +} + +int unsafeDeser(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + UnsafeRowFast unsafeRow(data); + const auto totalSize = + computeUnsafeTotalSize(unsafeRow, benchmarkCase.rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool.get()); + auto serialized = serializeUnsafeRows(unsafeRow, data->size(), buffer); + suspender.dismiss(); + + for (int i = 0; i < nIters; ++i) { + folly::doNotOptimizeAway(UnsafeRowDeserializer::deserialize( + serialized, benchmarkCase.rowType, pool.get())); + } + return nIters * data->size(); +} + +int compactDeser(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + CompactRow compactRow(data); + const auto totalSize = + computeCompactTotalSize(compactRow, benchmarkCase.rowType, data->size()); + auto buffer = AlignedBuffer::allocate(totalSize, pool.get()); + auto serialized = serializeCompactRows(compactRow, data->size(), buffer); + suspender.dismiss(); + + for (int i = 0; i < nIters; ++i) { + folly::doNotOptimizeAway( + CompactRow::deserialize(serialized, benchmarkCase.rowType, pool.get())); + } + return nIters * data->size(); +} + +// Register a serde benchmark (func) for one case, shown as func(label). We use +// addBenchmark directly (not BENCHMARK_NAMED_PARAM_MULTI) so the lambda can +// print a one-line progress message to stderr the first time it runs — folly's +// results table only prints at the very end, so this is how you see which +// benchmark is running. +#define SERDE_BENCH(func, label, benchmarkCase) \ + FOLLY_MAYBE_UNUSED static bool FB_ANONYMOUS_VARIABLE(serdeBench) = \ + (::folly::addBenchmark( \ + __FILE__, \ + #func "(" #label ")", \ + [](unsigned nIters) -> unsigned { \ + return func(nIters, benchmarkCase); \ + }), \ + true) + +int denseSer(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + suspender.dismiss(); + + // Full Vector -> buffer: denseSerialize builds the DenseRow (which runs the + // size pass — addColumnSizes), computes offsets, allocates, and serializes + // (it also strips top-level nulls, since DenseRow is marker-less). + for (int i = 0; i < nIters; ++i) { + folly::doNotOptimizeAway(denseSerialize(data, pool.get()).buffer); + } + return nIters * data->size(); +} + +int denseDeser(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + auto serialized = denseSerialize(data, pool.get()); + const auto* bytes = serialized.buffer->as(); + const auto* offsets = serialized.rowOffsets->as(); + const auto rowCount = data->size(); + std::vector rows; + rows.reserve(rowCount); + for (vector_size_t i = 0; i < rowCount; ++i) { + rows.emplace_back(bytes + offsets[i], offsets[i + 1] - offsets[i]); + } + suspender.dismiss(); + + for (int i = 0; i < nIters; ++i) { + folly::doNotOptimizeAway( + DenseRow::deserialize(rows, benchmarkCase.rowType, pool.get())); + } + return nIters * data->size(); +} + +// The dense size pass in isolation: DenseRow construction = decode + +// addColumnSizes (no buffer write). Lets you see how much of denseSer is the +// size pass vs the byte writing. +int denseSizePass(int nIters, const SerdeOnlyBenchmarkCase& benchmarkCase) { + auto pool = memory::memoryManager()->addLeafPool(); + + folly::BenchmarkSuspender suspender; + auto data = makeSerdeOnlyData(benchmarkCase, pool.get()); + RowVectorPtr input = data; + if (data->mayHaveNulls()) { + input = std::make_shared( + pool.get(), data->type(), nullptr, data->size(), data->children()); + } + suspender.dismiss(); + + for (int i = 0; i < nIters; ++i) { + DenseRow rows(input); + folly::doNotOptimizeAway(rows.rowSizes()[0]); + } + return nIters * data->size(); +} + +// Register every format for one case as a single adjacent block, so the +// results table groups them for comparison: the three serializers, then the +// dense size pass, then the three deserializers. Cases are invoked in a +// comparable order below (scalars, then each container type with its +// value-range variants). +#define CASE_BENCHMARKS(name, benchmarkCase) \ + SERDE_BENCH(unsafeSer, name, benchmarkCase); \ + SERDE_BENCH(compactSer, name, benchmarkCase); \ + SERDE_BENCH(denseSer, name, benchmarkCase); \ + SERDE_BENCH(denseSizePass, name, benchmarkCase); \ + SERDE_BENCH(unsafeDeser, name, benchmarkCase); \ + SERDE_BENCH(compactDeser, name, benchmarkCase); \ + SERDE_BENCH(denseDeser, name, benchmarkCase) + +constexpr BigintRange kRangeLt2Pow8{-((1LL << 8) - 1), (1LL << 8) - 1}; +constexpr BigintRange kRangeLt2Pow32{-((1LL << 32) - 1), (1LL << 32) - 1}; + +const SerdeOnlyBenchmarkCase kBigintLt2Pow8{ + ROW({BIGINT()}), + SerdeDataKind::kBigintScalar, + kRangeLt2Pow8}; +const SerdeOnlyBenchmarkCase kBigintLt2Pow32{ + ROW({BIGINT()}), + SerdeDataKind::kBigintScalar, + kRangeLt2Pow32}; +const SerdeOnlyBenchmarkCase kBigintRandom{ + ROW({BIGINT()}), + SerdeDataKind::kBigintScalar}; +// Full-range BIGINT with ~40% null children: exercises the null-handling +// (non-fast) encode/decode path that the SIMD/contiguous fast paths skip. +const SerdeOnlyBenchmarkCase kBigintRandomNullable{ + ROW({BIGINT()}), + SerdeDataKind::kDefault, + std::nullopt, + 0.4}; +const SerdeOnlyBenchmarkCase kDoubleRandom{ + ROW({DOUBLE()}), + SerdeDataKind::kDoubleRandom}; +const SerdeOnlyBenchmarkCase kStringLen8{ + ROW({VARCHAR()}), + SerdeDataKind::kStringLen8}; +const SerdeOnlyBenchmarkCase kStringLen100{ + ROW({VARCHAR()}), + SerdeDataKind::kStringLen100}; +const SerdeOnlyBenchmarkCase kArrays{ + ROW({BIGINT(), ARRAY(BIGINT())}), + SerdeDataKind::kDefault}; +const SerdeOnlyBenchmarkCase kNestedArrays{ + ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))}), + SerdeDataKind::kDefault}; +const SerdeOnlyBenchmarkCase kMaps{ + ROW({BIGINT(), MAP(BIGINT(), REAL())}), + SerdeDataKind::kDefault}; +const SerdeOnlyBenchmarkCase kArraysBigintLt2Pow8{ + ROW({BIGINT(), ARRAY(BIGINT())}), + SerdeDataKind::kBigintArray, + kRangeLt2Pow8}; +const SerdeOnlyBenchmarkCase kArraysBigintLt2Pow32{ + ROW({BIGINT(), ARRAY(BIGINT())}), + SerdeDataKind::kBigintArray, + kRangeLt2Pow32}; +const SerdeOnlyBenchmarkCase kNestedArraysBigintLt2Pow8{ + ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))}), + SerdeDataKind::kBigintNestedArray, + kRangeLt2Pow8}; +const SerdeOnlyBenchmarkCase kNestedArraysBigintLt2Pow32{ + ROW({BIGINT(), ARRAY(ARRAY(BIGINT()))}), + SerdeDataKind::kBigintNestedArray, + kRangeLt2Pow32}; +const SerdeOnlyBenchmarkCase kMapsBigintLt2Pow8{ + ROW({BIGINT(), MAP(BIGINT(), REAL())}), + SerdeDataKind::kBigintMap, + kRangeLt2Pow8}; +const SerdeOnlyBenchmarkCase kMapsBigintLt2Pow32{ + ROW({BIGINT(), MAP(BIGINT(), REAL())}), + SerdeDataKind::kBigintMap, + kRangeLt2Pow32}; + +// Flat row of multiple simple-type columns. Exercises the top-level ROW +// driver against scalar leaf encoders (no nested ARRAY/MAP), which is the +// path most directly comparable to CompactRow's strength. +const SerdeOnlyBenchmarkCase kMultiScalar5{ + ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()}), + SerdeDataKind::kDefault}; +const SerdeOnlyBenchmarkCase kMultiScalar10{ + ROW( + {BIGINT(), + INTEGER(), + SMALLINT(), + TINYINT(), + REAL(), + DOUBLE(), + BOOLEAN(), + BIGINT(), + INTEGER(), + DOUBLE()}), + SerdeDataKind::kDefault}; + +// Small-value variants: BIGINT (and INTEGER/SMALLINT/TINYINT for the 10-col +// case) restricted to [-(2^8-1), 2^8-1] so every integer encodes in a +// single varint byte. Highlights dense's strength on narrow scalar data +// where its on-wire size drops well below CompactRow's fixed widths. +const SerdeOnlyBenchmarkCase kMultiScalar5SmallLt2Pow8{ + ROW({BIGINT(), DOUBLE(), BOOLEAN(), TINYINT(), REAL()}), + SerdeDataKind::kMultiScalar5Small, + kRangeLt2Pow8}; +const SerdeOnlyBenchmarkCase kMultiScalar10SmallLt2Pow8{ + ROW( + {BIGINT(), + INTEGER(), + SMALLINT(), + TINYINT(), + REAL(), + DOUBLE(), + BOOLEAN(), + BIGINT(), + INTEGER(), + DOUBLE()}), + SerdeDataKind::kMultiScalar10Small, + kRangeLt2Pow8}; + +// --- Scalars: BIGINT swept by value range, then double / strings. --- +CASE_BENCHMARKS(bigint_lt_2pow8, kBigintLt2Pow8); +CASE_BENCHMARKS(bigint_lt_2pow32, kBigintLt2Pow32); +CASE_BENCHMARKS(bigint_random, kBigintRandom); +CASE_BENCHMARKS(bigint_random_nullable, kBigintRandomNullable); +CASE_BENCHMARKS(double_random, kDoubleRandom); +CASE_BENCHMARKS(string_len8, kStringLen8); +CASE_BENCHMARKS(string_len100, kStringLen100); + +// --- Multi-column flat rows: 5- and 10-column, full vs small-int. --- +CASE_BENCHMARKS(multiScalar5, kMultiScalar5); +CASE_BENCHMARKS(multiScalar5_small_lt_2pow8, kMultiScalar5SmallLt2Pow8); +CASE_BENCHMARKS(multiScalar10, kMultiScalar10); +CASE_BENCHMARKS(multiScalar10_small_lt_2pow8, kMultiScalar10SmallLt2Pow8); + +// --- Containers: each type next to its value-range variants. --- +CASE_BENCHMARKS(arrays, kArrays); +CASE_BENCHMARKS(arrays_bigint_lt_2pow8, kArraysBigintLt2Pow8); +CASE_BENCHMARKS(arrays_bigint_lt_2pow32, kArraysBigintLt2Pow32); +CASE_BENCHMARKS(nestedArrays, kNestedArrays); +CASE_BENCHMARKS(nestedArrays_bigint_lt_2pow8, kNestedArraysBigintLt2Pow8); +CASE_BENCHMARKS(nestedArrays_bigint_lt_2pow32, kNestedArraysBigintLt2Pow32); +CASE_BENCHMARKS(maps, kMaps); +CASE_BENCHMARKS(maps_bigint_lt_2pow8, kMapsBigintLt2Pow8); +CASE_BENCHMARKS(maps_bigint_lt_2pow32, kMapsBigintLt2Pow32); + +} // namespace + +// =========================================================================== +// Size-pass microbenchmark: times the REAL scalar::addColumnSizes on a flat +// nullable BIGINT column across value magnitudes. Calling the compiled function +// (not a local copy) keeps it from auto-vectorizing in this TU, so this matches +// the production embedded behavior. A/B the internal kernel by toggling the +// SIMD wiring in scalar::addColumnSizes and rebuilding. +// =========================================================================== +namespace size_bench { +constexpr vector_size_t kN = 4096; + +struct SizeInput { + VectorPtr vec; // flat nullable BIGINT + DecodedVector decoded; + std::vector rowSizes; +}; + +// magnitude: 0 = small [-100,100], 1 = full int32, 2 = full int64; ~10% nulls. +std::unique_ptr makeInput(memory::MemoryPool* pool, int mag) { + auto in = std::make_unique(); + in->vec = BaseVector::create(BIGINT(), kN, pool); + auto* flat = in->vec->asUnchecked>(); + auto* raw = flat->mutableRawValues(); + std::mt19937_64 rng(0x9E3779B97F4A7C15ull ^ static_cast(mag)); + for (vector_size_t i = 0; i < kN; ++i) { + if (mag == 0) { + raw[i] = static_cast(rng() % 201) - 100; + } else if (mag == 1) { + raw[i] = static_cast(static_cast(rng())); + } else { + raw[i] = static_cast(rng()); + } + if (rng() % 10 == 0) { + flat->setNull(i, true); + } + } + in->decoded.decode(*in->vec); + in->rowSizes.assign(kN, 0); + return in; +} + +// Dictionary-wrapped (reversed indices) → non-identity, so addColumnSizes takes +// the SCALAR nullableInt64SerializedSize loop instead of the SIMD kernel. +std::unique_ptr makeDictInput(memory::MemoryPool* pool, int mag) { + auto in = makeInput(pool, mag); // reuse value/null generation + auto flat = in->vec; // the flat nullable BIGINT + auto indices = allocateIndices(kN, pool); + auto* idx = indices->asMutable(); + for (vector_size_t i = 0; i < kN; ++i) { + idx[i] = kN - 1 - i; // reversed -> non-identity + } + in->vec = BaseVector::wrapInDictionary(nullptr, indices, kN, flat); + in->decoded.decode(*in->vec); + return in; +} + +} // namespace size_bench + +#define SIZE_BENCH(tag, mag) \ + BENCHMARK(addColumnSizes_##tag) { \ + static auto pool = memory::memoryManager()->addLeafPool(); \ + static auto in = size_bench::makeInput(pool.get(), mag); \ + dense_row::scalar::addColumnSizes( \ + *BIGINT(), in->decoded, size_bench::kN, in->rowSizes.data()); \ + folly::doNotOptimizeAway(in->rowSizes[0]); \ + } \ + BENCHMARK(addColumnSizes_dict_##tag) { \ + static auto pool = memory::memoryManager()->addLeafPool(); \ + static auto in = size_bench::makeDictInput(pool.get(), mag); \ + dense_row::scalar::addColumnSizes( \ + *BIGINT(), in->decoded, size_bench::kN, in->rowSizes.data()); \ + folly::doNotOptimizeAway(in->rowSizes[0]); \ + } + +SIZE_BENCH(small, 0) +SIZE_BENCH(medium_i32, 1) +SIZE_BENCH(large_i64, 2) + +// Printed once at the end (after folly's timing table): the serialized size of +// each case in all three row formats (bytes/row). The benchmark table itself +// shows only timings. +void printSerializedSizes() { + struct NamedCase { + const char* name; + const SerdeOnlyBenchmarkCase* benchmarkCase; + }; + static const NamedCase cases[] = { + {"bigint_lt_2pow8", &kBigintLt2Pow8}, + {"bigint_lt_2pow32", &kBigintLt2Pow32}, + {"bigint_random", &kBigintRandom}, + {"bigint_random_nullable", &kBigintRandomNullable}, + {"double_random", &kDoubleRandom}, + {"string_len8", &kStringLen8}, + {"string_len100", &kStringLen100}, + {"multiScalar5", &kMultiScalar5}, + {"multiScalar5_small_lt_2pow8", &kMultiScalar5SmallLt2Pow8}, + {"multiScalar10", &kMultiScalar10}, + {"multiScalar10_small_lt_2pow8", &kMultiScalar10SmallLt2Pow8}, + {"arrays", &kArrays}, + {"arrays_bigint_lt_2pow8", &kArraysBigintLt2Pow8}, + {"arrays_bigint_lt_2pow32", &kArraysBigintLt2Pow32}, + {"nestedArrays", &kNestedArrays}, + {"nestedArrays_bigint_lt_2pow8", &kNestedArraysBigintLt2Pow8}, + {"nestedArrays_bigint_lt_2pow32", &kNestedArraysBigintLt2Pow32}, + {"maps", &kMaps}, + {"maps_bigint_lt_2pow8", &kMapsBigintLt2Pow8}, + {"maps_bigint_lt_2pow32", &kMapsBigintLt2Pow32}, + }; + + std::printf("\n=== serialized size (bytes/row) ===\n"); + std::printf("%-30s %8s %8s %8s\n", "case", "unsafe", "compact", "dense"); + auto pool = memory::memoryManager()->addLeafPool(); + for (const auto& nc : cases) { + auto data = makeSerdeOnlyData(*nc.benchmarkCase, pool.get()); + const auto rows = data->size(); + UnsafeRowFast unsafeRow(data); + CompactRow compactRow(data); + const auto u = + computeUnsafeTotalSize(unsafeRow, nc.benchmarkCase->rowType, rows) / + rows; + const auto c = + computeCompactTotalSize(compactRow, nc.benchmarkCase->rowType, rows) / + rows; + const auto d = denseSerialize(data, pool.get()).buffer->size() / rows; + std::printf("%-30s %8zu %8zu %8zu\n", nc.name, u, c, d); + } + std::fflush(stdout); +} + +} // namespace bytedance::bolt::row + +int main(int argc, char** argv) { + folly::init(&argc, &argv); + bytedance::bolt::memory::MemoryManager::initialize({}); + folly::runBenchmarks(); + bytedance::bolt::row::printSerializedSizes(); + return 0; +} diff --git a/bolt/row/dense/DenseRow.cpp b/bolt/row/dense/DenseRow.cpp new file mode 100644 index 000000000..53e898a31 --- /dev/null +++ b/bolt/row/dense/DenseRow.cpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bolt/row/dense/DenseRow.h" + +// ============================================================================= +// Wire format (in-tree spec — frozen, byte-identical across this codebase) +// ============================================================================= +// +// Encoding is purely TYPE-DRIVEN: every value is encoded according to its +// vector's concrete type. A row's blob is the concatenation of its fields, +// each encoded by its own type. There is no top-level row marker — the caller +// frames rows and guarantees no top-level null rows. +// +// Core property: LEVEL-HOISTED. At every nesting level the structural bytes for +// all of that level's positions (nested ROW markers / ARRAY|MAP cardinalities / +// VARCHAR lengths) are written first, then the level descends into children. +// Each row's whole blob (all levels) lives in that row's own byte range. +// +// row_blob := encode(field_0) ... encode(field_{k-1}) +// +// encode(T): +// TINYINT/SMALLINT/INTEGER/BIGINT/TIMESTAMP: +// null -> 0x00 | INT64_MIN -> 0x80 0x00 +// else -> varint(zigzag(adjust(v))), adjust(v) = v > 0 ? v : v - 1 +// BOOLEAN: varint(0 = null | 1 = false | 2 = true) +// REAL: 4B LE float bits; sentinel 0x7fc00000 = null +// (a non-null value colliding with the sentinel is +// bit-flipped on encode and restored on decode) +// DOUBLE: 8B LE; sentinel 0x7ff8000000000000 = null (as above) +// VARCHAR/VARBINARY: varint(len + 1) (0 = null), then len payload bytes. +// Under a multi-position level, ALL lengths are written +// before ALL payloads. +// HUGEINT: nullable-int64 of zigzag128(value)'s low 64 bits (its +// 0x00 sentinel = null, no separate tag); when non-null, +// followed by varint(high 64 bits of zigzag128(value)) +// UNKNOWN: varint(0) (always null) +// ROW: per position varint(0 = null | 1 = present), then +// recurse each field (null positions emit only the +// marker and are filtered from children via parentNulls) +// ARRAY: per position varint(0 = null | cardinality + 1), then +// recurse the element column over the child positions +// MAP: per position varint(0 = null | cardinality + 1), then +// recurse the keys column, then the values column +// +// Frozen invariants: the INT64_MIN sentinel, cardinality + 1, MAP's keys-then- +// values segment ordering, and the level-hoisted ordering above. Empty +// array/map (cardinality+1 = 1) is distinct from null (0). +// +// Routing is per top-level field by type: scalar fields take the scalar +// column-at-a-time path, complex (ARRAY/MAP/ROW) fields take the general +// column-batch path; a row whose fields are all scalar uses a dedicated +// fast path that skips the general scaffolding entirely. The codec kernels +// live in sibling TUs, all declared in DenseRowGeneral.h: +// * DenseRowGeneralEncode.cpp general column-batch encode +// * DenseRowGeneralDecode.cpp general column-batch decode +// * DenseRowScalar.{h,*.cpp} scalar column fast path +// This file is the public API layer (the DenseRow class) only. +// ============================================================================= + +#include +#include +#include +#include + +#include "bolt/row/dense/DenseRowGeneral.h" +#include "bolt/row/dense/DenseRowScalar.h" +#include "bolt/vector/ComplexVector.h" +#include "vector/DecodedVector.h" + +namespace bytedance::bolt::row { + +using namespace dense_row; + +struct DenseRow::State { + // Keeps the input column data alive for this DenseRow's lifetime. + RowVectorPtr rowVector; + vector_size_t numRows{0}; + std::vector rowSizes; + size_t totalSize{0}; + + // Routing is per top-level field by type: a scalar field is decoded and + // sized/written column-at-a-time (DecodecVector); a complex (ARRAY/MAP/ROW) + // field goes through the general column-batch path (ColumnPlan). Both vectors + // are sized to fieldCount; for field k exactly one entry is populated. + std::vector> decodedOrPlans; + + // Top-level slot view ({r, 1} per row) for the complex columns. Only built + // when the row has a complex field (an all-scalar row leaves it empty). The + // nested slot trees live in each ARRAY/MAP node's ColumnPlan::childSlots, + // built by the size pass and replayed by the write pass. + TopSlotView topView; +}; + +DenseRow::DenseRow(const RowVectorPtr& rowVector) + : state_(std::make_unique()) { + auto& st = *state_; + st.rowVector = rowVector; + const auto numRows = rowVector->size(); + st.numRows = numRows; + st.rowSizes.assign(numRows, 0); + const auto& rowType = rowVector->type()->asRow(); + const auto fieldCount = rowType.size(); + + if (numRows > 0) { + // Route each top-level field by type: a scalar field is sized + // column-at-a-time straight into rowSizes; a complex (ARRAY/MAP/ROW) field + // runs the general SizeSink pass (which also builds the slot tree reused by + // the write pass) and accumulates into sizeSinks. The general scaffolding + // (slot view + sink array + plan slots) is allocated only when a complex + // field is present, so an all-scalar row pays nothing extra. Complex fields + // are visited in field order so the slot tree replays in that order during + // serialize(). + bool anyComplex = false; + for (size_t k = 0; k < fieldCount; ++k) { + if (!rowType.childAt(k)->isPrimitiveType()) { + anyComplex = true; + break; + } + } + + st.decodedOrPlans.resize(fieldCount); + std::vector sizeSinks; + if (anyComplex) { + st.topView = makeTopView(numRows); + sizeSinks.resize(numRows); + } + + for (size_t k = 0; k < fieldCount; ++k) { + const auto& childType = rowType.childAt(k); + if (childType->isPrimitiveType()) { + st.decodedOrPlans[k].emplace(); + auto* decoded = std::get_if(&st.decodedOrPlans[k]); + + decoded->decode(*rowVector->childAt(k)); + scalar::addColumnSizes( + *childType, *decoded, numRows, st.rowSizes.data()); + } else { + st.decodedOrPlans[k].emplace( + buildPlan(childType, rowVector->childAt(k))); + auto* plan = std::get_if(&st.decodedOrPlans[k]); + encodeColumnBatch( + *childType, + *plan, + st.topView.view(), + folly::Range(sizeSinks.data(), numRows), + /*rowNulls=*/nullptr); + } + } + if (anyComplex) { + for (vector_size_t r = 0; r < numRows; ++r) { + st.rowSizes[r] += sizeSinks[r].bytes; + } + } + } + + size_t total = 0; + for (size_t s : st.rowSizes) { + total += s; + } + st.totalSize = total; +} + +DenseRow::DenseRow(DenseRow&&) noexcept = default; +DenseRow& DenseRow::operator=(DenseRow&&) noexcept = default; +DenseRow::~DenseRow() = default; + +vector_size_t DenseRow::numRows() const { + return state_->numRows; +} + +const std::vector& DenseRow::rowSizes() const { + return state_->rowSizes; +} + +size_t DenseRow::rowSizeAt(vector_size_t index) const { + return state_->rowSizes[index]; +} + +size_t DenseRow::totalSize() const { + return state_->totalSize; +} + +void DenseRow::serialize(uint8_t* base, folly::Range offsets) + const { + const auto numRows = state_->numRows; + BOLT_USER_CHECK_EQ( + offsets.size(), + static_cast(numRows), + "DenseRow::serialize offsets size mismatch"); + if (numRows == 0) { + return; + } + const auto& rowType = state_->rowVector->type()->asRow(); + const auto fieldCount = rowType.size(); + + // Write fields in declaration order, sharing one per-row write cursor so each + // field lands at the right offset. Scalar fields advance the cursor directly; + // complex fields run the general WriteSink pass (replaying the cached slot + // tree), syncing the cursor across the call. writeSinks is allocated only if + // a complex field is present (an all-scalar row never touches it). + std::vector cursors(numRows); + for (vector_size_t r = 0; r < numRows; ++r) { + cursors[r] = base + offsets[r]; + } + std::vector writeSinks; + for (size_t k = 0; k < fieldCount; ++k) { + const auto& childType = rowType.childAt(k); + std::visit( + [&](auto& decodedOrPlan) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + BOLT_CHECK(childType->isPrimitiveType()); + scalar::writeColumn( + *childType, decodedOrPlan, numRows, cursors.data()); + } else { + static_assert(std::is_same_v); + if (writeSinks.empty()) { + writeSinks.resize(numRows); + } + for (vector_size_t r = 0; r < numRows; ++r) { + writeSinks[r].out = cursors[r]; + } + encodeColumnBatch( + *childType, + decodedOrPlan, + state_->topView.view(), + folly::Range(writeSinks.data(), numRows), + /*rowNulls=*/nullptr); + for (vector_size_t r = 0; r < numRows; ++r) { + cursors[r] = writeSinks[r].out; + } + } + }, + state_->decodedOrPlans[k]); + } + + for (vector_size_t r = 0; r < numRows; ++r) { + const auto* rowStart = base + offsets[r]; + const auto actualSize = static_cast(cursors[r] - rowStart); + const auto expectedSize = state_->rowSizes[r]; + BOLT_CHECK_EQ( + actualSize, + expectedSize, + "DenseRow::serialize row size mismatch at row {}, offset {}", + r, + offsets[r]); + } +} + +RowVectorPtr DenseRow::deserialize( + const std::vector& data, + const RowTypePtr& rowType, + memory::MemoryPool* pool) { + const auto rowCount = static_cast(data.size()); + + // Decode fields in declaration order, sharing one per-row read cursor + // (marker-less, so no top-level nulls). Scalar fields read column-at-a-time; + // complex fields run the general decode over the top slot view, which is + // built only when a complex field is present. Mirrors the per-column + // serialize path. + auto out = BaseVector::create(rowType, rowCount, pool); + auto* rowVec = out->asUnchecked(); + std::vector cursors(rowCount); + for (vector_size_t r = 0; r < rowCount; ++r) { + cursors[r].cur = reinterpret_cast(data[r].data()); + cursors[r].end = cursors[r].cur + data[r].size(); + rowVec->setNull(r, false); + } + + const auto cursorRange = folly::Range(cursors.data(), rowCount); + const auto fieldCount = rowType->size(); + bool anyComplex = false; + for (size_t k = 0; k < fieldCount; ++k) { + if (!rowType->childAt(k)->isPrimitiveType()) { + anyComplex = true; + break; + } + } + TopSlotView top; + if (anyComplex) { + top = makeTopView(rowCount); + } + for (size_t k = 0; k < fieldCount; ++k) { + const auto& childType = rowType->childAt(k); + if (childType->isPrimitiveType()) { + scalar::readColumn( + *childType, *rowVec->childAt(k), rowCount, cursorRange); + } else { + decodeColumnBatch( + *childType, + *rowVec->childAt(k), + top.view(), + cursorRange, + /*rowNulls=*/nullptr); + } + } + + for (vector_size_t r = 0; r < rowCount; ++r) { + BOLT_USER_CHECK( + cursors[r].cur == cursors[r].end, + "DenseRow: row {} not fully consumed ({} bytes remaining)", + r, + cursors[r].end - cursors[r].cur); + } + return std::dynamic_pointer_cast(out); +} + +} // namespace bytedance::bolt::row diff --git a/bolt/row/dense/DenseRow.h b/bolt/row/dense/DenseRow.h new file mode 100644 index 000000000..b169db3b8 --- /dev/null +++ b/bolt/row/dense/DenseRow.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include "bolt/vector/BaseVector.h" +#include "bolt/vector/ComplexVector.h" + +// Dense row serializer — sibling to CompactRow / UnsafeRowFast, but +// column-batched (processes all rows at once) rather than row-at-a-time, so it +// exposes only batch operations (no single-row rowSize(i) / serialize(i)). +// +// The wire is the "dense", no-waste layout +// 1. variable-length (varint) values +// 2. nulls fused into the structure bytes (no null bitmap) +// 3. no alignment padding, +// 4. level-hoisted nesting. +// The grammar is documented at the top of DenseRow.cpp. +// +// Usage (mirrors CompactRow): +// DenseRow rows(rowVector); // builds plan + sizes (once) +// auto offsets = to_offsets(rows.rowSizes()); +// rows.serialize(base, offsets); // write all rows at the offsets +// +// auto rv = DenseRow::deserialize(ranges, rowType, pool); +namespace bytedance::bolt::row { + +class DenseRow { + public: + explicit DenseRow(const RowVectorPtr& vector); + DenseRow(DenseRow&&) noexcept; + DenseRow& operator=(DenseRow&&) noexcept; + ~DenseRow(); + + vector_size_t numRows() const; + + size_t rowSizeAt(vector_size_t index) const; + + // Per-row encoded byte counts (all rows). DenseRow precomputes these in its + // size pass, so this bulk accessor is free; rowSizeAt() indexes into it. + const std::vector& rowSizes() const; + + // Sum of rowSizes() + size_t totalSize() const; + + // Serialize every row into `base + offsets[r]`. `offsets.size()` must equal + // numRows(); row r writes exactly rowSizes()[r] bytes. + void serialize(uint8_t* base, folly::Range offsets) const; + + // Reconstruct a flat RowVector of `rowType` from pre-split per-row byte + // ranges (one entry per row). Inverse of serialize(). + static RowVectorPtr deserialize( + const std::vector& data, + const RowTypePtr& rowType, + memory::MemoryPool* pool); + + private: + struct State; + std::unique_ptr state_; +}; + +} // namespace bytedance::bolt::row diff --git a/bolt/row/dense/DenseRowGeneral.h b/bolt/row/dense/DenseRowGeneral.h new file mode 100644 index 000000000..3ae37d78d --- /dev/null +++ b/bolt/row/dense/DenseRowGeneral.h @@ -0,0 +1,295 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "bolt/common/base/Nulls.h" +#include "bolt/row/dense/IntVarint.h" +#include "bolt/vector/BaseVector.h" +#include "bolt/vector/DecodedVector.h" + +// Internal declarations for the GENERAL (non-flat) dense-row codec: the +// column-batch machinery that handles arbitrary nesting (ARRAY/MAP/ROW) and +// dictionary/constant inputs. Shared between its two implementation TUs — +// DenseRowGeneralEncode.cpp and DenseRowGeneralDecode.cpp — and included by the +// DenseRow public API layer (DenseRow.cpp) and the scalar fast path +// (DenseRowScalar*.cpp) for the common varint/slot helpers below. +// +// Encode and decode live in separate TUs so their (cache-alignment-sensitive) +// machine-code layout is not perturbed by unrelated edits to the other side — +// see the note on intra-TU layout sensitivity in DenseRow.cpp. +namespace bytedance::bolt::row::dense_row { + +// The BMI2 fast path is selected at compile time inside IntVarint.h (gated by +// the x86_64 `#if`), so these are just the detail helpers under this namespace. +using detail::readNullableInt128; +using detail::readNullableInt64; +using detail::readVarint; +using detail::writeNullableInt128; +using detail::writeNullableInt64; +using detail::writeVarint; + +// Null sentinels for REAL/DOUBLE: a non-null value whose raw bits collide with +// the sentinel is bit-flipped on encode and restored on decode. +constexpr uint32_t kNullFloatBits{0x7fc00000U}; +constexpr uint64_t kNullDoubleBits{0x7ff8000000000000ULL}; + +// ============================================================================= +// Shared slot machinery for column-batch encode/decode. +// ============================================================================= +// +// Each call processes N source rows. At every recursion level, each source row +// contributes zero or more contiguous slot ranges in the level's vector. +// SlotView.slots is a flat array of (base, count) ranges; the entries for +// source row r occupy slots[rowBoundaries[r]..rowBoundaries[r+1]). +// +// POSITION SPACE: slot positions index the CURRENT level's vector — for a +// nested ARRAY/MAP level that is the child elements vector's own position +// space (built from the parent's rawOffsets/rawSizes), NOT the parent's. The +// top-level SlotView indexes the top vector ({r, 1} per row). `parentNulls`, if +// set, is indexed by these same current-level positions. A leaf encoder maps a +// position `p` to the decoded value via `plan.decoded` (identity, or +// `decoded.index(p)` for dictionary/constant inputs) — so `p` is the decoded +// vector's position, and rawOffsets/rawSizes for ARRAY/MAP are read at the +// decoded index. +// +// Multiple ranges per row are necessary because ArrayVector/MapVector input can +// have non-contiguous child layouts (gaps between adjacent parent slots), so +// each non-null parent slot contributes its own child range. Decoded output +// vectors are always packed contiguously, so on decode each source row's child +// ranges may happen to be back-to-back, but the representation stays uniform. +struct SlotRange { + uint32_t base; + uint32_t count; +}; + +struct SlotView { + folly::Range slots; + folly::Range rowBoundaries; // size N+1 + // Per-position null bitmap inherited from ancestor ROWs. Indexed by the + // current level's vector positions. nullptr means no filter. + const uint64_t* parentNulls = nullptr; +}; + +struct RowCursor { + const uint8_t* cur; + const uint8_t* end; +}; + +// Iterate over a single source row's live positions. Walks every slot range +// belonging to row r and every position inside it, skipping positions covered +// by parentNulls. +template +FOLLY_ALWAYS_INLINE void forEachLivePos(SlotView v, vector_size_t r, F f) { + const uint64_t* nulls = v.parentNulls; + const auto* slots = v.slots.data(); + const auto lo = v.rowBoundaries[r]; + const auto hi = v.rowBoundaries[r + 1]; + if (!nulls) { + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = slots[i]; + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + f(p); + } + } + } else { + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = slots[i]; + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + if (bits::isBitNull(nulls, static_cast(p))) { + continue; + } + f(p); + } + } + } +} + +// Top-level SlotView covering every position [0, rowCount): one {r, 1} slot per +// source row. Used by both serialize and deserialize entry points. +struct TopSlotView { + std::vector slots; + std::vector boundaries; + + SlotView view() { + return SlotView{ + {slots.data(), slots.size()}, + {boundaries.data(), boundaries.size()}, + nullptr}; + } +}; + +// TODO delete TopSlotView +inline TopSlotView makeTopView(vector_size_t rowCount) { + TopSlotView tv; + tv.slots.resize(rowCount); + tv.boundaries.resize(rowCount + 1); + for (vector_size_t r = 0; r < rowCount; ++r) { + tv.slots[r] = {static_cast(r), 1u}; + tv.boundaries[r] = static_cast(r); + } + tv.boundaries[rowCount] = static_cast(rowCount); + return tv; +} + +// ============================================================================= +// ENCODE — column-batch encode kernels (DenseRowGeneralEncode.cpp). +// ============================================================================= + +// Backing storage for one nested ARRAY/MAP level's child SlotView: the child +// slot ranges + per-source-row boundaries. Built by the SizeSink pass and read +// back by the WriteSink pass (see ColumnPlan::childSlots). +struct SlotTreeNode { + folly::small_vector slots; + folly::small_vector boundaries; +}; + +// One node of the per-row encode plan: a column at one nesting level. Caches +// that column's DecodedVector (so reads see through dictionary/constant +// wrapping) and, for nested types, its child plans. A flat tagged struct (no +// inheritance): `kind` selects how `children` is interpreted — +// ARRAY -> {elements}, MAP -> {keys, values}, ROW -> fields, scalar -> {}. +// `buildPlan` produces the tree once; both the size and write passes reuse it. +// The concrete ArrayVector/MapVector/RowVector base is recovered at the use +// site via decoded.base()->as<...>(). +struct ColumnPlan { + TypeKind kind{TypeKind::UNKNOWN}; + DecodedVector decoded; + bool mayHaveNulls{false}; + bool isNullColumn{false}; + std::vector children; + // The vector this node's `decoded` reads, held so the node is self-contained: + // both `decoded` and the buffers it points into stay valid for the node's + // lifetime, independent of who else references the input. Null only for an + // all-null (missing) ROW child. + VectorPtr source; + // ARRAY/MAP only: the child SlotView's storage, built into the plan tree by + // the SizeSink pass and replayed by the WriteSink pass — so each level reads + // its own slots straight off the tree (no shared cursor/scratch between + // passes). `mutable` because the (size) pass fills it through a `const` + // ColumnPlan&; it is a derived cache, not part of the plan's identity. + mutable SlotTreeNode childSlots; +}; + +ColumnPlan buildPlan(const TypePtr& type, const VectorPtr& vector); + +// A "sink" abstracts the size pass vs the write pass: encodeColumnBatch is +// templated on it so both passes share one implementation (byte counts and +// bytes-written cannot drift). SizeSink accumulates a byte count; WriteSink +// writes bytes through a moving cursor. +// +// The size pass does not naively walk every value: fixed-width leaves +// (BOOLEAN/REAL/DOUBLE) add count*width analytically, and integer leaves use +// the SIMD-batched sumNullableIntSizes; only variable-length leaves (VARCHAR) +// are walked per value, which is unavoidable. The SizeSink pass also builds the +// slot tree that the write pass reuses for nested ARRAY/MAP, so it is not just +// a size computation that could be replaced by a closed-form estimate. +struct SizeSink { + size_t bytes{0}; + + FOLLY_ALWAYS_INLINE void putVarint(uint64_t v) { + bytes += detail::varintSize(v); + } + FOLLY_ALWAYS_INLINE void putNullableInt64(int64_t v, bool isNull) { + bytes += detail::nullableInt64SerializedSize(v, isNull); + } + FOLLY_ALWAYS_INLINE void putRaw(const void* /*p*/, size_t n) { + bytes += n; + } + template + FOLLY_ALWAYS_INLINE void putFixed(const T& /*v*/) { + bytes += sizeof(T); + } +}; + +struct WriteSink { + uint8_t* out{nullptr}; + + FOLLY_ALWAYS_INLINE void putVarint(uint64_t v) { + out = writeVarint(v, out); + } + FOLLY_ALWAYS_INLINE void putNullableInt64(int64_t v, bool isNull) { + out = writeNullableInt64(v, isNull, out); + } + FOLLY_ALWAYS_INLINE void putRaw(const void* p, size_t n) { + std::memcpy(out, p, n); + out += n; + } + template + FOLLY_ALWAYS_INLINE void putFixed(const T& v) { + std::memcpy(out, &v, sizeof(T)); + out += sizeof(T); + } +}; + +// Encode one column (any type) for N source rows into the per-row sinks. The +// SizeSink pass fills each ARRAY/MAP node's ColumnPlan::childSlots; the +// WriteSink pass reads them back. Instantiated for SizeSink and WriteSink in +// DenseRowGeneralEncode.cpp. +template +void encodeColumnBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls); + +// Encode a ROW level directly (entry point for the marker-less serializer). +// emitMarker=false omits the per-position present/null marker (caller asserts +// no nulls at this level). +template +void encodeRowBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls, + bool emitMarker = true); + +// ============================================================================= +// DECODE — column-batch decode kernels (DenseRowGeneralDecode.cpp). +// ============================================================================= + +// Decode entry points. Both are mutually recursive across the type dispatch. +// `readMarker == false` is the marker-less shuffle contract (caller asserts +// every top-level row is non-null). +void decodeColumnBatch( + const Type& type, + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls); + +void decodeRowBatch( + const Type& type, + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls, + bool readMarker = true); + +} // namespace bytedance::bolt::row::dense_row diff --git a/bolt/row/dense/DenseRowGeneralDecode.cpp b/bolt/row/dense/DenseRowGeneralDecode.cpp new file mode 100644 index 000000000..24bec611d --- /dev/null +++ b/bolt/row/dense/DenseRowGeneralDecode.cpp @@ -0,0 +1,638 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Decode (deserialize) kernels for the level-hoisted dense row format. +// +// These nested-container decode loops are pathologically sensitive to machine- +// code layout: on byte-identical source, small shifts in surrounding code swing +// individual cases (nested ARRAY/MAP/long-string deserialize) by ~10-14% purely +// from cache-line / branch-predictor aliasing — see the intra-TU layout note in +// DenseRow.cpp. They live in their own translation unit so that unrelated +// edits to the encode/serializer code no longer re-roll that layout lottery; +// the decode layout is now determined solely by this file. Do not merge these +// kernels back into another TU, and re-run dense_row_serialize_benchmark +// (dense_deserialize_*) after any change here. + +#include "bolt/row/dense/DenseRowGeneral.h" + +#include +#include +#include +#include + +#include + +#include "bolt/vector/ComplexVector.h" +#include "bolt/vector/FlatVector.h" + +namespace bytedance::bolt::row::dense_row { + +// ============================================================================= +// Decode side +// ============================================================================= + +template +void decodeIntegerBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + bool isNull{false}; + int64_t v{0}; + BOLT_USER_CHECK( + readNullableInt64(c.cur, c.end, isNull, v), + "DenseRow: malformed integer value at row {}", + r); + if (isNull) { + flat->setNull(static_cast(p), true); + } else { + if constexpr (!std::is_same_v) { + BOLT_USER_CHECK( + v >= static_cast(std::numeric_limits::min()) && + v <= static_cast(std::numeric_limits::max()), + "DenseRow: integer value out of range at row {}: {}", + r, + v); + } + flat->setNull(static_cast(p), false); + raw[p] = static_cast(v); + } + }); + } +} + +void decodeBooleanBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + uint64_t v{0}; + BOLT_USER_CHECK( + readVarint(c.cur, c.end, v), + "DenseRow: malformed boolean at row {}", + r); + if (v == 0) { + flat->setNull(static_cast(p), true); + } else { + BOLT_USER_CHECK( + v <= 2, "DenseRow: invalid boolean encoding at row {}: {}", r, v); + flat->setNull(static_cast(p), false); + flat->set(static_cast(p), v == 2); + } + }); + } +} + +void decodeRealBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + BOLT_USER_CHECK( + static_cast(c.end - c.cur) >= sizeof(uint32_t), + "DenseRow: truncated real at row {}", + r); + uint32_t b; + std::memcpy(&b, c.cur, sizeof(b)); + c.cur += sizeof(b); + if (b == kNullFloatBits) { + flat->setNull(static_cast(p), true); + } else { + flat->setNull(static_cast(p), false); + if (FOLLY_UNLIKELY(b == (kNullFloatBits ^ 1u))) { + b ^= 1u; + } + std::memcpy(raw + p, &b, sizeof(b)); + } + }); + } +} + +void decodeDoubleBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + BOLT_USER_CHECK( + static_cast(c.end - c.cur) >= sizeof(uint64_t), + "DenseRow: truncated double at row {}", + r); + uint64_t b; + std::memcpy(&b, c.cur, sizeof(b)); + c.cur += sizeof(b); + if (b == kNullDoubleBits) { + flat->setNull(static_cast(p), true); + } else { + flat->setNull(static_cast(p), false); + if (FOLLY_UNLIKELY(b == (kNullDoubleBits ^ 1ull))) { + b ^= 1ull; + } + std::memcpy(raw + p, &b, sizeof(b)); + } + }); + } +} + +// Mirror of encodeHugeintBatch: nullableInt64(low 64 of zigzag128) carrying the +// null marker, then (if non-null) varint(high 64). See +// detail::writeNullableInt128. +void decodeHugeintBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + bool isNull{false}; + int128_t v{0}; + BOLT_USER_CHECK( + readNullableInt128(c.cur, c.end, isNull, v), + "DenseRow: malformed hugeint at row {}", + r); + flat->setNull(static_cast(p), isNull); + if (!isNull) { + raw[p] = v; + } + }); + } +} + +void decodeVarcharBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + // Write StringViews via raw pointer to avoid flat->set()'s extra memcpy + // for non-inline strings (the payload already lives in our buffer). + auto* rawValues = flat->mutableRawValues(); + char* buf{nullptr}; + size_t bufRemaining = 0; + const auto N = static_cast(cursors.size()); + + // Wire layout per row segment: length stream then payload stream. + // Decode must mirror that split. Stash per-slot length (or -1 for null) + // so the payload pass can do the placement. + folly::small_vector lengths; + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + lengths.clear(); + forEachLivePos(out, r, [&](uint32_t /*p*/) { + uint64_t v{0}; + BOLT_USER_CHECK( + readVarint(c.cur, c.end, v), + "DenseRow: malformed varchar length at row {}", + r); + lengths.push_back(v == 0 ? -1 : static_cast(v - 1)); + }); + + size_t idx = 0; + forEachLivePos(out, r, [&](uint32_t p) { + const int32_t len = lengths[idx++]; + if (len < 0) { + flat->setNull(static_cast(p), true); + return; + } + const auto ulen = static_cast(len); + BOLT_USER_CHECK( + static_cast(c.end - c.cur) >= ulen, + "DenseRow: truncated varchar payload at row {}", + r); + if (ulen <= StringView::kInlineSize) { + rawValues[p] = StringView(reinterpret_cast(c.cur), ulen); + } else { + if (bufRemaining < ulen) { + // Upper bound: bytes remaining in this row + every later non-null + // row's bytes. + size_t needed = static_cast(c.end - c.cur); + for (vector_size_t j = r + 1; j < N; ++j) { + if (rowNulls && bits::isBitNull(rowNulls, j)) { + continue; + } + needed += static_cast(cursors[j].end - cursors[j].cur); + } + buf = flat->getRawStringBufferWithSpace(needed, true); + bufRemaining = needed; + } + std::memcpy(buf, c.cur, ulen); + rawValues[p] = StringView(buf, ulen); + buf += ulen; + bufRemaining -= ulen; + } + c.cur += ulen; + }); + } +} + +void decodeTimestampBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* flat = dst.asUnchecked>(); + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + bool isNull{false}; + int64_t micros{0}; + BOLT_USER_CHECK( + readNullableInt64(c.cur, c.end, isNull, micros), + "DenseRow: malformed timestamp at row {}", + r); + if (isNull) { + flat->setNull(static_cast(p), true); + } else { + flat->setNull(static_cast(p), false); + flat->set( + static_cast(p), + Timestamp::fromMicrosNoError(micros)); + } + }); + } +} + +void decodeNullColumnBatch( + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + const auto N = static_cast(cursors.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + uint64_t v{0}; + BOLT_USER_CHECK( + readVarint(c.cur, c.end, v), + "DenseRow: malformed unknown-type marker at row {}", + r); + BOLT_USER_CHECK( + v == 0, "DenseRow: unknown-type expected null marker at row {}", r); + dst.setNull(static_cast(p), true); + }); + } +} + +// Pass 1 for ARRAY/MAP decode: read cardinality varints in row/parent-slot +// order. Allocates one (off, sz) child-slot entry per parent slot — null +// parent slots get (0, 0) entries as placeholders for index alignment with +// the parent-slot iteration. We also write the parent ArrayVector/MapVector +// offsets/sizes/nulls at this stage since the layout is already known. +// Callback `assign(pos, isNull, off, sz)` is passed as a template parameter so +// it inlines (no std::function indirect call per element in the decode hot +// loop). +template +void decodeArrayLikePass1( + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls, + vector_size_t childBase, + folly::small_vector& childSlots, + folly::small_vector& childBoundaries, + vector_size_t& totalChildren, + const char* what, + Assign assign) { + const auto N = static_cast(cursors.size()); + childBoundaries.resize(N + 1); + childBoundaries[0] = 0; + vector_size_t writeHead = childBase; + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + childBoundaries[r + 1] = static_cast(childSlots.size()); + continue; + } + RowCursor& c = cursors[r]; + forEachLivePos(out, r, [&](uint32_t p) { + uint64_t e{0}; + BOLT_USER_CHECK( + readVarint(c.cur, c.end, e), + "DenseRow: malformed {} cardinality at row {}", + what, + r); + if (e == 0) { + assign(static_cast(p), /*isNull=*/true, 0, 0); + } else { + // Bound the cardinality before the (narrowing) cast: each element + // consumes >= 1 byte further in this row's blob, so a cardinality + // larger than the bytes remaining for this row is corrupt input. + // Guards against overflowing writeHead/totalChildren and the + // subsequent child-vector resize on malformed wire. + const uint64_t card = e - 1; + BOLT_USER_CHECK_LE( + card, + static_cast(c.end - c.cur), + "DenseRow: {} cardinality {} exceeds remaining bytes at row {}", + what, + card, + r); + const auto sz = static_cast(card); + assign(static_cast(p), /*isNull=*/false, writeHead, sz); + if (sz > 0) { + childSlots.push_back( + {static_cast(writeHead), static_cast(sz)}); + } + writeHead += sz; + } + }); + childBoundaries[r + 1] = static_cast(childSlots.size()); + } + totalChildren = writeHead - childBase; +} + +void decodeArrayBatch( + const Type& type, + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* arr = dst.asUnchecked(); + auto& elements = *arr->elements(); + const vector_size_t childBase = elements.size(); + + folly::small_vector childSlots; + folly::small_vector childBoundaries; + vector_size_t totalChildren = 0; + decodeArrayLikePass1( + out, + cursors, + rowNulls, + childBase, + childSlots, + childBoundaries, + totalChildren, + "array", + [&](vector_size_t pos, bool isNull, vector_size_t off, vector_size_t sz) { + arr->setNull(pos, isNull); + arr->setOffsetAndSize(pos, off, sz); + }); + + elements.resize(childBase + totalChildren); + + SlotView childView{ + {childSlots.data(), childSlots.size()}, + {childBoundaries.data(), childBoundaries.size()}, + nullptr}; + decodeColumnBatch(*type.childAt(0), elements, childView, cursors, rowNulls); +} + +void decodeMapBatch( + const Type& type, + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + auto* m = dst.asUnchecked(); + auto& keys = *m->mapKeys(); + auto& values = *m->mapValues(); + const vector_size_t childBase = keys.size(); + + folly::small_vector childSlots; + folly::small_vector childBoundaries; + vector_size_t totalChildren = 0; + decodeArrayLikePass1( + out, + cursors, + rowNulls, + childBase, + childSlots, + childBoundaries, + totalChildren, + "map", + [&](vector_size_t pos, bool isNull, vector_size_t off, vector_size_t sz) { + m->setNull(pos, isNull); + m->setOffsetAndSize(pos, off, sz); + }); + + keys.resize(childBase + totalChildren); + values.resize(childBase + totalChildren); + + SlotView childView{ + {childSlots.data(), childSlots.size()}, + {childBoundaries.data(), childBoundaries.size()}, + nullptr}; + decodeColumnBatch(*type.childAt(0), keys, childView, cursors, rowNulls); + decodeColumnBatch(*type.childAt(1), values, childView, cursors, rowNulls); +} + +void decodeRowBatch( + const Type& type, + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls, + bool readMarker) { + auto* row = dst.asUnchecked(); + const auto N = static_cast(cursors.size()); + + vector_size_t bitmapBits = 0; + for (const auto& sr : out.slots) { + const auto endPos = static_cast(sr.base + sr.count); + if (endPos > bitmapBits) { + bitmapBits = endPos; + } + } + + std::vector childNullsBuf; + const uint64_t* childParentNulls = out.parentNulls; + if (bitmapBits > 0) { + childNullsBuf.assign(bits::nwords(bitmapBits), ~uint64_t{0}); + childParentNulls = childNullsBuf.data(); + } + + if (readMarker) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + RowCursor& c = cursors[r]; + const auto lo = out.rowBoundaries[r]; + const auto hi = out.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = out.slots[i]; + const uint32_t endPos = sr.base + sr.count; + for (uint32_t p = sr.base; p < endPos; ++p) { + const bool parentSaysNull = out.parentNulls && + bits::isBitNull(out.parentNulls, static_cast(p)); + if (parentSaysNull) { + if (!childNullsBuf.empty()) { + bits::setNull(childNullsBuf.data(), p, true); + } + continue; + } + uint64_t v{0}; + BOLT_USER_CHECK( + readVarint(c.cur, c.end, v), + "DenseRow: malformed row null marker at row {}", + r); + if (v == 0) { + row->setNull(static_cast(p), true); + if (!childNullsBuf.empty()) { + bits::setNull(childNullsBuf.data(), p, true); + } + } else { + BOLT_USER_CHECK( + v == 1, + "DenseRow: invalid row null marker at row {}: {}", + r, + v); + row->setNull(static_cast(p), false); + } + } + } + } + } else { + // No marker on wire — caller asserts every position is non-null. + // Mirror the null-tracking the marker pass would have done for a + // non-null row so descended children see consistent parentNulls. + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + const auto lo = out.rowBoundaries[r]; + const auto hi = out.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = out.slots[i]; + const uint32_t endPos = sr.base + sr.count; + for (uint32_t p = sr.base; p < endPos; ++p) { + const bool parentSaysNull = out.parentNulls && + bits::isBitNull(out.parentNulls, static_cast(p)); + if (parentSaysNull) { + if (!childNullsBuf.empty()) { + bits::setNull(childNullsBuf.data(), p, true); + } + continue; + } + row->setNull(static_cast(p), false); + } + } + } + } + + SlotView childView{out.slots, out.rowBoundaries, childParentNulls}; + const auto fieldCount = type.size(); + for (size_t f = 0; f < fieldCount; ++f) { + decodeColumnBatch( + *type.childAt(f), *row->childAt(f), childView, cursors, rowNulls); + } +} + +void decodeColumnBatch( + const Type& type, + BaseVector& dst, + SlotView out, + folly::Range cursors, + const uint64_t* rowNulls) { + switch (type.kind()) { + case TypeKind::BOOLEAN: + decodeBooleanBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::TINYINT: + decodeIntegerBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::SMALLINT: + decodeIntegerBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::INTEGER: + decodeIntegerBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::BIGINT: + decodeIntegerBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::REAL: + decodeRealBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::DOUBLE: + decodeDoubleBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::HUGEINT: + decodeHugeintBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + decodeVarcharBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::TIMESTAMP: + decodeTimestampBatch(dst, out, cursors, rowNulls); + return; + case TypeKind::ARRAY: + decodeArrayBatch(type, dst, out, cursors, rowNulls); + return; + case TypeKind::MAP: + decodeMapBatch(type, dst, out, cursors, rowNulls); + return; + case TypeKind::ROW: + decodeRowBatch(type, dst, out, cursors, rowNulls); + return; + case TypeKind::UNKNOWN: + decodeNullColumnBatch(dst, out, cursors, rowNulls); + return; + default: + BOLT_UNREACHABLE(); + } +} + +} // namespace bytedance::bolt::row::dense_row diff --git a/bolt/row/dense/DenseRowGeneralEncode.cpp b/bolt/row/dense/DenseRowGeneralEncode.cpp new file mode 100644 index 000000000..4c5f2c4a8 --- /dev/null +++ b/bolt/row/dense/DenseRowGeneralEncode.cpp @@ -0,0 +1,818 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// General (non-flat) column-batch encode kernels for the dense row format. See +// DenseRowGeneral.h. Kept in its own TU to isolate code layout from the decode +// kernels and the flat path; the two Sink instantiations the serializer needs +// are explicitly instantiated at the bottom. + +#include "bolt/row/dense/DenseRowGeneral.h" + +#include +#include +#include + +#include "bolt/common/base/Nulls.h" +#include "bolt/row/dense/IntVarint.h" +#include "bolt/vector/ComplexVector.h" +#include "bolt/vector/FlatVector.h" + +namespace bytedance::bolt::row::dense_row { + +using detail::nullableInt64SerializedSize; +using detail::varintSize; + +template +void encodeColumnBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls); + +// Build the per-column plan tree by recursively decoding all nested vectors. +// Each node holds `source` — the vector it decoded — so the node is +// self-contained: its `decoded` (a non-owning view) and the held vector live +// together, independent of who else references the input. For nested types +// `children` holds the sub-plans (ARRAY -> {elements}, MAP -> {keys, values}, +// ROW -> fields). +// NOLINTNEXTLINE(misc-no-recursion) +ColumnPlan buildPlan(const TypePtr& type, const VectorPtr& vector) { + ColumnPlan plan; + plan.source = vector; + plan.kind = type->kind(); + plan.decoded.decode(*vector); + plan.mayHaveNulls = plan.decoded.mayHaveNulls(); + + switch (plan.kind) { + case TypeKind::ARRAY: { + const auto* array = plan.decoded.base()->as(); + BOLT_CHECK_NOT_NULL(array, "buildPlan: ARRAY base is not ArrayVector"); + plan.children.push_back(buildPlan(type->childAt(0), array->elements())); + return plan; + } + case TypeKind::MAP: { + const auto* map = plan.decoded.base()->as(); + BOLT_CHECK_NOT_NULL(map, "buildPlan: MAP base is not MapVector"); + plan.children.push_back(buildPlan(type->childAt(0), map->mapKeys())); + plan.children.push_back(buildPlan(type->childAt(1), map->mapValues())); + return plan; + } + case TypeKind::ROW: { + const auto* row = plan.decoded.base()->as(); + BOLT_CHECK_NOT_NULL(row, "buildPlan: ROW base is not RowVector"); + const auto& rowType = type->asRow(); + plan.children.reserve(rowType.size()); + + // For a dict/constant-wrapped ROW, push the outer mapping down onto each + // base child so its DecodedVector reads through the wrap. The wrapped + // child becomes that child node's `source`. The index buffer is reused + // from the input's own wrapInfo() for a single-level dictionary (no + // copy); for constant / multi-level it is materialized once (the resolved + // indices aren't a standalone input buffer). + BufferPtr outerIndices; + const auto outerSize = vector->size(); + if (!plan.decoded.isIdentityMapping()) { + if (vector->encoding() == VectorEncoding::Simple::DICTIONARY && + vector->wrapInfo()->as() == plan.decoded.indices()) { + outerIndices = vector->wrapInfo(); + } else { + outerIndices = + AlignedBuffer::allocate(outerSize, vector->pool()); + std::memcpy( + outerIndices->asMutable(), + plan.decoded.indices(), + outerSize * sizeof(vector_size_t)); + } + } + + for (size_t i = 0; i < rowType.size(); ++i) { + const auto& baseChild = row->childAt(i); + if (!baseChild) { + ColumnPlan nullPlan; + nullPlan.kind = TypeKind::UNKNOWN; + nullPlan.isNullColumn = true; + plan.children.push_back(std::move(nullPlan)); + continue; + } + if (outerIndices) { + plan.children.push_back(buildPlan( + rowType.childAt(i), + BaseVector::wrapInDictionary( + /*nulls=*/BufferPtr{nullptr}, + outerIndices, + outerSize, + baseChild))); + } else { + plan.children.push_back(buildPlan(rowType.childAt(i), baseChild)); + } + } + return plan; + } + default: + // Scalar leaves: nothing more to build. + return plan; + } +} + +// ============================================================================= +// Encode side +// ============================================================================= + +// Dedicated nullable-int encoder for any of int8/int16/int32/int64. Fast +// path (identity-mapped + no nulls + no parentNulls): walks each slot +// range as a contiguous int sequence and uses SIMD-batched varint sizing +// on the size pass, plus a tight scalar loop on the write pass. +template +void encodeIntegerBatchT( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + const bool identity = plan.decoded.isIdentityMapping(); + const auto* raw = plan.decoded.data(); + const bool fastPath = identity && !mayNulls && in.parentNulls == nullptr; + + if (fastPath) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + if constexpr (std::is_same_v) { + s.bytes += detail::sumNullableIntSizes(raw + sr.base, sr.count); + } else { + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + s.putNullableInt64(static_cast(raw[p]), false); + } + } + } + } + return; + } + + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + const bool isNull = mayNulls && plan.decoded.isNullAt(p); + const int64_t v = isNull + ? 0 + : static_cast( + identity ? raw[p] : raw[plan.decoded.index(p)]); + s.putNullableInt64(v, isNull); + }); + } +} + +// BOOLEAN: each non-null value emits exactly 1 byte (varint 1 or 2), null +// emits varint(0) = 1 byte. Total bytes per range = sr.count regardless +// of value distribution — size pass collapses to a single add. +template +void encodeBooleanBatch( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + + if (in.parentNulls == nullptr) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + if constexpr (std::is_same_v) { + size_t total = 0; + for (uint32_t i = lo; i < hi; ++i) { + total += in.slots[i].count; + } + s.bytes += total; + } else { + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.putVarint(0); + } else { + s.putVarint(plan.decoded.valueAt(p) ? 2 : 1); + } + } + } + } + } + return; + } + + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.putVarint(0); + } else { + s.putVarint(plan.decoded.valueAt(p) ? 2 : 1); + } + }); + } +} + +template +void encodeRealBatch( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + const bool identity = plan.decoded.isIdentityMapping(); + const auto* raw = plan.decoded.data(); + const bool fastPath = identity && !mayNulls && in.parentNulls == nullptr; + + if (fastPath) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + if constexpr (std::is_same_v) { + s.bytes += static_cast(sr.count) * sizeof(uint32_t); + } else { + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + uint32_t b; + std::memcpy(&b, raw + p, sizeof(b)); + if (FOLLY_UNLIKELY(b == kNullFloatBits)) { + b ^= 1u; + } + s.template putFixed(b); + } + } + } + } + return; + } + + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.template putFixed(kNullFloatBits); + return; + } + const float value = plan.decoded.valueAt(p); + uint32_t b; + std::memcpy(&b, &value, sizeof(b)); + // Match v1 collision policy: bit-flip the rare value that aliases + // the null sentinel. Inputs whose bits already equal kNullFloatBits^1 + // round-trip through a single-bit corruption — same lossy behavior + // as the v1 wire format. + if (FOLLY_UNLIKELY(b == kNullFloatBits)) { + b ^= 1u; + } + s.template putFixed(b); + }); + } +} + +template +void encodeDoubleBatch( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + const bool identity = plan.decoded.isIdentityMapping(); + const auto* raw = plan.decoded.data(); + const bool fastPath = identity && !mayNulls && in.parentNulls == nullptr; + + if (fastPath) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + if constexpr (std::is_same_v) { + s.bytes += static_cast(sr.count) * sizeof(uint64_t); + } else { + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + uint64_t b; + std::memcpy(&b, raw + p, sizeof(b)); + if (FOLLY_UNLIKELY(b == kNullDoubleBits)) { + b ^= 1ull; + } + s.template putFixed(b); + } + } + } + } + return; + } + + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.template putFixed(kNullDoubleBits); + return; + } + const double value = plan.decoded.valueAt(p); + uint64_t b; + std::memcpy(&b, &value, sizeof(b)); + if (FOLLY_UNLIKELY(b == kNullDoubleBits)) { + b ^= 1ull; + } + s.template putFixed(b); + }); + } +} + +template +void encodeVarcharBatch( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + // Wire layout per row segment: length stream then payload stream. + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.putVarint(0); + } else { + s.putVarint( + static_cast(plan.decoded.valueAt(p).size()) + + 1); + } + }); + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + return; + } + const auto sv = plan.decoded.valueAt(p); + s.putRaw(sv.data(), sv.size()); + }); + } +} + +template +void encodeTimestampBatch( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + const bool isNull = mayNulls && plan.decoded.isNullAt(p); + s.putNullableInt64( + isNull ? 0 : plan.decoded.valueAt(p).toMicros(), isNull); + }); + } +} + +// HUGEINT wire format: the null marker is folded into the low int64 slot (no +// separate presence tag). null -> nullableInt64(_, null) (a single 0x00 byte); +// non-null -> nullableInt64(low 64 of zigzag128(value)), varint(high 64). +// Small/ medium DECIMAL unscaled values encode in a few bytes (vs the old fixed +// 16). Mirrors detail::writeNullableInt128. +template +void encodeHugeintBatch( + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.putNullableInt64(0, /*isNull=*/true); + } else { + const int128_t v = plan.decoded.valueAt(p); + const auto zz = detail::zigZagEncode128(v); + s.putNullableInt64( + static_cast(static_cast(zz)), /*isNull=*/false); + s.putVarint(static_cast(zz >> 64)); + } + }); + } +} + +template +void encodeNullColumnBatch( + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto N = static_cast(sinks.size()); + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t /*p*/) { s.putVarint(0); }); + } +} + +// ARRAY/MAP cardinality emission + (conditionally) child slot tree build. +// +// For SizeSink: emits per-slot cardinality varint *sizes* and builds the +// child SlotRange/boundaries arrays in `node`. +// For WriteSink: emits per-slot cardinality *bytes* and skips the build — +// `node` was already populated by the prior SizeSink walk. +// +// Walks parent positions identically in both passes (the cardinality stream +// must be byte-for-byte identical between size and write), but the +// push_back work happens only on the size pass. This roughly halves the +// per-call slot-tree overhead vs rebuilding on each pass. +template +void encodeArrayLikeCardinalities( + const ColumnPlan& plan, + const vector_size_t* rawOffsets, + const vector_size_t* rawSizes, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls, + SlotTreeNode& node) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + const bool identity = plan.decoded.isIdentityMapping(); + constexpr bool kBuild = std::is_same_v; + + if constexpr (kBuild) { + node.slots.clear(); + node.boundaries.clear(); + node.boundaries.resize(N + 1); + node.boundaries[0] = 0; + } + + // Fast path: no nulls on parent vector, no parentNulls bitmap, identity + // mapping. Inline the hot loop (cardinality emit + maybe push). + if (!mayNulls && identity && in.parentNulls == nullptr) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + if constexpr (kBuild) { + node.boundaries[r + 1] = static_cast(node.slots.size()); + } + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + const uint32_t end = sr.base + sr.count; + for (uint32_t p = sr.base; p < end; ++p) { + const auto sz = static_cast(rawSizes[p]); + s.putVarint(static_cast(sz) + 1); + if constexpr (kBuild) { + node.slots.push_back({static_cast(rawOffsets[p]), sz}); + } + } + } + if constexpr (kBuild) { + node.boundaries[r + 1] = static_cast(node.slots.size()); + } + } + return; + } + + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + if constexpr (kBuild) { + node.boundaries[r + 1] = static_cast(node.slots.size()); + } + continue; + } + Sink& s = sinks[r]; + forEachLivePos(in, r, [&](uint32_t p) { + if (mayNulls && plan.decoded.isNullAt(p)) { + s.putVarint(0); + return; + } + const auto idx = + identity ? p : plan.decoded.index(static_cast(p)); + const auto sz = static_cast(rawSizes[idx]); + s.putVarint(static_cast(sz) + 1); + if constexpr (kBuild) { + node.slots.push_back({static_cast(rawOffsets[idx]), sz}); + } + }); + if constexpr (kBuild) { + node.boundaries[r + 1] = static_cast(node.slots.size()); + } + } +} + +template +void encodeArrayBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto* array = plan.decoded.base()->as(); + // This level owns its child slots in the plan tree: the SizeSink pass builds + // them, the WriteSink pass reads them straight back (no cursor/scratch). + SlotTreeNode& node = plan.childSlots; + if constexpr (std::is_same_v) { + // Upper bound: total non-null parent slots ≤ elements vector size. + node.slots.reserve(array->elements()->size()); + } + encodeArrayLikeCardinalities( + plan, array->rawOffsets(), array->rawSizes(), in, sinks, rowNulls, node); + SlotView childView{ + {node.slots.data(), node.slots.size()}, + {node.boundaries.data(), node.boundaries.size()}, + nullptr}; + encodeColumnBatch( + *type.childAt(0), plan.children[0], childView, sinks, rowNulls); +} + +template +void encodeMapBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + const auto* map = plan.decoded.base()->as(); + // One child slot set (built by the SizeSink pass) drives both keys and + // values. + SlotTreeNode& node = plan.childSlots; + if constexpr (std::is_same_v) { + node.slots.reserve(map->mapKeys()->size()); + } + encodeArrayLikeCardinalities( + plan, map->rawOffsets(), map->rawSizes(), in, sinks, rowNulls, node); + SlotView childView{ + {node.slots.data(), node.slots.size()}, + {node.boundaries.data(), node.boundaries.size()}, + nullptr}; + encodeColumnBatch( + *type.childAt(0), plan.children[0], childView, sinks, rowNulls); + encodeColumnBatch( + *type.childAt(1), plan.children[1], childView, sinks, rowNulls); +} + +template +void encodeRowBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls, + bool emitMarker) { + const auto N = static_cast(sinks.size()); + const bool mayNulls = plan.mayHaveNulls; + + // The subfield slot range equals this ROW's slot range — same positions, + // filtered by combined ancestor + this-level nulls. Materialize a bitmap + // only if there's anything to filter. + const bool needBitmap = mayNulls || in.parentNulls != nullptr; + vector_size_t bitmapBits = 0; + if (needBitmap) { + for (const auto& sr : in.slots) { + const auto endPos = static_cast(sr.base + sr.count); + if (endPos > bitmapBits) { + bitmapBits = endPos; + } + } + } + + std::vector childNullsBuf; + const uint64_t* childParentNulls = in.parentNulls; + if (needBitmap && bitmapBits > 0) { + childNullsBuf.assign(bits::nwords(bitmapBits), ~uint64_t{0}); + childParentNulls = childNullsBuf.data(); + } + + // Fast path: no nulls anywhere AND no parentNulls. Every position emits + // varint(1) = 1 byte and contributes nothing to childNullsBuf. SizeSink + // collapses to a bulk count add; WriteSink writes 1 byte per slot. + // When emitMarker is false (top-level non-null contract from caller via + // serializeAt), the marker step is skipped entirely; nested ROW levels + // always call this with the default true. + if (!mayNulls && in.parentNulls == nullptr) { + if (emitMarker) { + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + if constexpr (std::is_same_v) { + size_t total = 0; + for (uint32_t i = lo; i < hi; ++i) { + total += in.slots[i].count; + } + s.bytes += total; + } else { + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + const uint32_t endPos = sr.base + sr.count; + for (uint32_t p = sr.base; p < endPos; ++p) { + (void)p; + s.putVarint(1); + } + } + } + } + } + SlotView childView{in.slots, in.rowBoundaries, childParentNulls}; + const auto fieldCount = type.size(); + for (size_t f = 0; f < fieldCount; ++f) { + encodeColumnBatch( + *type.childAt(f), plan.children[f], childView, sinks, rowNulls); + } + return; + } + + for (vector_size_t r = 0; r < N; ++r) { + if (rowNulls && bits::isBitNull(rowNulls, r)) { + continue; + } + Sink& s = sinks[r]; + const auto lo = in.rowBoundaries[r]; + const auto hi = in.rowBoundaries[r + 1]; + for (uint32_t i = lo; i < hi; ++i) { + const auto& sr = in.slots[i]; + const uint32_t endPos = sr.base + sr.count; + for (uint32_t p = sr.base; p < endPos; ++p) { + const bool parentSaysNull = in.parentNulls && + bits::isBitNull(in.parentNulls, static_cast(p)); + if (parentSaysNull) { + if (!childNullsBuf.empty()) { + bits::setNull(childNullsBuf.data(), p, true); + } + continue; + } + if (mayNulls && plan.decoded.isNullAt(p)) { + s.putVarint(0); + if (!childNullsBuf.empty()) { + bits::setNull(childNullsBuf.data(), p, true); + } + } else { + s.putVarint(1); + } + } + } + } + + SlotView childView{in.slots, in.rowBoundaries, childParentNulls}; + const auto fieldCount = type.size(); + for (size_t f = 0; f < fieldCount; ++f) { + encodeColumnBatch( + *type.childAt(f), plan.children[f], childView, sinks, rowNulls); + } +} + +template +void encodeColumnBatch( + const Type& type, + const ColumnPlan& plan, + SlotView in, + folly::Range sinks, + const uint64_t* rowNulls) { + if (plan.isNullColumn) { + encodeNullColumnBatch(in, sinks, rowNulls); + return; + } + switch (plan.kind) { + case TypeKind::BOOLEAN: + encodeBooleanBatch(plan, in, sinks, rowNulls); + return; + case TypeKind::TINYINT: + encodeIntegerBatchT(plan, in, sinks, rowNulls); + return; + case TypeKind::SMALLINT: + encodeIntegerBatchT(plan, in, sinks, rowNulls); + return; + case TypeKind::INTEGER: + encodeIntegerBatchT(plan, in, sinks, rowNulls); + return; + case TypeKind::BIGINT: + encodeIntegerBatchT(plan, in, sinks, rowNulls); + return; + case TypeKind::REAL: + encodeRealBatch(plan, in, sinks, rowNulls); + return; + case TypeKind::DOUBLE: + encodeDoubleBatch(plan, in, sinks, rowNulls); + return; + case TypeKind::HUGEINT: + encodeHugeintBatch(plan, in, sinks, rowNulls); + return; + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: + encodeVarcharBatch(plan, in, sinks, rowNulls); + return; + case TypeKind::TIMESTAMP: + encodeTimestampBatch(plan, in, sinks, rowNulls); + return; + case TypeKind::ARRAY: + encodeArrayBatch(type, plan, in, sinks, rowNulls); + return; + case TypeKind::MAP: + encodeMapBatch(type, plan, in, sinks, rowNulls); + return; + case TypeKind::ROW: + encodeRowBatch(type, plan, in, sinks, rowNulls); + return; + case TypeKind::UNKNOWN: + encodeNullColumnBatch(in, sinks, rowNulls); + return; + default: + BOLT_UNREACHABLE(); + } +} + +// Explicit instantiations for the two passes (size + write). Keeping the kernel +// bodies in this TU (not a header) is what isolates their code layout. +template void encodeColumnBatch( + const Type&, + const ColumnPlan&, + SlotView, + folly::Range, + const uint64_t*); +template void encodeColumnBatch( + const Type&, + const ColumnPlan&, + SlotView, + folly::Range, + const uint64_t*); +template void encodeRowBatch( + const Type&, + const ColumnPlan&, + SlotView, + folly::Range, + const uint64_t*, + bool); +template void encodeRowBatch( + const Type&, + const ColumnPlan&, + SlotView, + folly::Range, + const uint64_t*, + bool); + +} // namespace bytedance::bolt::row::dense_row diff --git a/bolt/row/dense/DenseRowScalar.h b/bolt/row/dense/DenseRowScalar.h new file mode 100644 index 000000000..266a95428 --- /dev/null +++ b/bolt/row/dense/DenseRowScalar.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include "bolt/vector/BaseVector.h" +#include "bolt/vector/DecodedVector.h" + +namespace bytedance::bolt::row::dense_row { +struct RowCursor; +} + +// Scalar column fast path: a scalar-typed (non ARRAY/MAP/ROW) top-level column +// has a trivial per-row wire layout `[v]` — no row marker, no cardinality +// cards, no parent-null filtering — so it is encoded/decoded column-at-a-time, +// skipping the SlotView machinery entirely. DenseRow routes each scalar +// top-level field here and each complex field to the general path +// (DenseRowGeneral.h). +namespace bytedance::bolt::row::dense_row::scalar { + +// Column-at-a-time size accumulation: adds field `dec`'s per-row byte counts +// into rowSizes[0..N). +void addColumnSizes( + const Type& type, + const DecodedVector& dec, + vector_size_t N, + size_t* rowSizes); + +// Column-at-a-time write: appends field `dec`'s bytes through per-row cursors +// rowCursors[0..N), advancing each. +void writeColumn( + const Type& type, + const DecodedVector& dec, + vector_size_t N, + uint8_t** rowCursors); + +// Column-at-a-time read: decodes one scalar value per row from cursors[0..N) +// into `dst`, advancing each cursor. Inverse of writeColumn +void readColumn( + const Type& type, + BaseVector& dst, + vector_size_t N, + folly::Range cursors); + +} // namespace bytedance::bolt::row::dense_row::scalar diff --git a/bolt/row/dense/DenseRowScalarDecode.cpp b/bolt/row/dense/DenseRowScalarDecode.cpp new file mode 100644 index 000000000..d2b0066e5 --- /dev/null +++ b/bolt/row/dense/DenseRowScalarDecode.cpp @@ -0,0 +1,262 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Scalar-row fast path — DECODE side (read). See DenseRowScalar.h. +// +// Column-at-a-time, row-major reads: for each scalar field, walk the N per-row +// cursors and decode one value from each. This is already the fast shape — it +// skips the SlotView machinery the general decoder uses — and there is no SIMD +// batch decode because varint parsing is inherently sequential. The varint +// readers (readVarint / readNullableInt64) use the BMI2 short-fast-path. The +// input is always marker-less with no top-level null rows, so there is no +// per-row row-null filtering here. +// +// Split from the encode side so the two layout-sensitive scalar hot paths do +// not perturb each other's code layout. + +#include "bolt/row/dense/DenseRowScalar.h" + +#include +#include +#include +#include + +#include "bolt/row/dense/DenseRowGeneral.h" +#include "bolt/row/dense/IntVarint.h" +#include "bolt/vector/ComplexVector.h" +#include "bolt/vector/FlatVector.h" + +namespace bytedance::bolt::row::dense_row::scalar { + +template +void readIntColumn( + FlatVector* flat, + vector_size_t N, + folly::Range cursors) { + auto* raw = flat->mutableRawValues(); + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + bool isNull{false}; + int64_t v{0}; + [[maybe_unused]] const bool ok = readNullableInt64(c.cur, c.end, isNull, v); + BOLT_DCHECK(ok, "DenseRow: malformed integer value at row {}", r); + if (isNull) { + flat->setNull(r, true); + } else { + if constexpr (!std::is_same_v) { + BOLT_DCHECK( + v >= static_cast(std::numeric_limits::min()) && + v <= static_cast(std::numeric_limits::max()), + "DenseRow: integer value out of range at row {}: {}", + r, + v); + } + flat->setNull(r, false); + raw[r] = static_cast(v); + } + } +} + +void readColumn( + const Type& type, + BaseVector& dst, + vector_size_t N, + folly::Range cursors) { + switch (type.kind()) { + case TypeKind::BOOLEAN: { + auto* flat = dst.asUnchecked>(); + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + uint64_t v{0}; + [[maybe_unused]] const bool ok = readVarint(c.cur, c.end, v); + BOLT_DCHECK(ok, "DenseRow: malformed boolean at row {}", r); + if (v == 0) { + flat->setNull(r, true); + } else { + BOLT_DCHECK( + v <= 2, "DenseRow: invalid boolean encoding at row {}: {}", r, v); + flat->setNull(r, false); + flat->set(r, v == 2); + } + } + return; + } + case TypeKind::UNKNOWN: + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + uint64_t v{0}; + [[maybe_unused]] const bool ok = readVarint(c.cur, c.end, v); + BOLT_DCHECK(ok, "DenseRow: malformed unknown-type marker at row {}", r); + BOLT_DCHECK( + v == 0, "DenseRow: unknown-type expected null marker at row {}", r); + dst.setNull(r, true); + } + return; + case TypeKind::TINYINT: + readIntColumn(dst.asUnchecked>(), N, cursors); + return; + case TypeKind::SMALLINT: + readIntColumn( + dst.asUnchecked>(), N, cursors); + return; + case TypeKind::INTEGER: + readIntColumn( + dst.asUnchecked>(), N, cursors); + return; + case TypeKind::BIGINT: + readIntColumn( + dst.asUnchecked>(), N, cursors); + return; + case TypeKind::REAL: { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + BOLT_DCHECK( + static_cast(c.end - c.cur) >= sizeof(uint32_t), + "DenseRow: truncated real at row {}", + r); + uint32_t b; + std::memcpy(&b, c.cur, sizeof(b)); + c.cur += sizeof(b); + if (b == kNullFloatBits) { + flat->setNull(r, true); + } else { + flat->setNull(r, false); + if (FOLLY_UNLIKELY(b == (kNullFloatBits ^ 1u))) { + b ^= 1u; + } + std::memcpy(raw + r, &b, sizeof(b)); + } + } + return; + } + case TypeKind::DOUBLE: { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + BOLT_DCHECK( + static_cast(c.end - c.cur) >= sizeof(uint64_t), + "DenseRow: truncated double at row {}", + r); + uint64_t b; + std::memcpy(&b, c.cur, sizeof(b)); + c.cur += sizeof(b); + if (b == kNullDoubleBits) { + flat->setNull(r, true); + } else { + flat->setNull(r, false); + if (FOLLY_UNLIKELY(b == (kNullDoubleBits ^ 1ull))) { + b ^= 1ull; + } + std::memcpy(raw + r, &b, sizeof(b)); + } + } + return; + } + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: { + auto* flat = dst.asUnchecked>(); + auto* rawValues = flat->mutableRawValues(); + // Inline values (<= kInlineSize) live in the StringView itself; longer + // ones are copied into the vector's string buffer, carved from fixed-size + // chunks (a string larger than a chunk gets its own exact allocation). + // Fixed chunks avoid both a look-ahead scan of the remaining cursor bytes + // and a getRawStringBufferWithSpace call per non-inline value. + constexpr size_t kStringChunk = 32 * 1024; + char* heap = nullptr; + size_t heapRemaining = 0; + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + uint64_t lenPlus{0}; + // Always-on: the length and payload-bounds guards gate a memcpy of a + // wire-controlled length, so a corrupt/truncated value must fail loudly + // rather than over-read the input buffer (matches the general decoder). + BOLT_USER_CHECK( + readVarint(c.cur, c.end, lenPlus), + "DenseRow: malformed varchar length at row {}", + r); + if (lenPlus == 0) { + flat->setNull(r, true); + continue; + } + const auto len = static_cast(lenPlus - 1); + BOLT_USER_CHECK( + static_cast(c.end - c.cur) >= len, + "DenseRow: truncated varchar payload at row {}", + r); + flat->setNull(r, false); + if (len <= StringView::kInlineSize) { + rawValues[r] = StringView(reinterpret_cast(c.cur), len); + } else { + if (heapRemaining < len) { + const size_t alloc = len > kStringChunk ? len : kStringChunk; + heap = flat->getRawStringBufferWithSpace(alloc, /*exactSize=*/true); + heapRemaining = alloc; + } + std::memcpy(heap, c.cur, len); + rawValues[r] = StringView(heap, len); + heap += len; + heapRemaining -= len; + } + c.cur += len; + } + return; + } + case TypeKind::TIMESTAMP: { + auto* flat = dst.asUnchecked>(); + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + bool isNull{false}; + int64_t micros{0}; + [[maybe_unused]] const bool ok = + readNullableInt64(c.cur, c.end, isNull, micros); + BOLT_DCHECK(ok, "DenseRow: malformed timestamp at row {}", r); + if (isNull) { + flat->setNull(r, true); + } else { + flat->setNull(r, false); + flat->set(r, Timestamp::fromMicrosNoError(micros)); + } + } + return; + } + case TypeKind::HUGEINT: { + auto* flat = dst.asUnchecked>(); + auto* raw = flat->mutableRawValues(); + for (vector_size_t r = 0; r < N; ++r) { + RowCursor& c = cursors[r]; + bool isNull{false}; + int128_t v{0}; + [[maybe_unused]] const bool ok = + readNullableInt128(c.cur, c.end, isNull, v); + BOLT_DCHECK(ok, "DenseRow: malformed hugeint at row {}", r); + if (isNull) { + flat->setNull(r, true); + } else { + flat->setNull(r, false); + raw[r] = v; + } + } + return; + } + default: + BOLT_UNREACHABLE(); + } +} + +} // namespace bytedance::bolt::row::dense_row::scalar diff --git a/bolt/row/dense/DenseRowScalarEncode.cpp b/bolt/row/dense/DenseRowScalarEncode.cpp new file mode 100644 index 000000000..799f2f2ae --- /dev/null +++ b/bolt/row/dense/DenseRowScalarEncode.cpp @@ -0,0 +1,289 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Scalar-row fast path — ENCODE side (size + write). See +// DenseRowScalar.h. Split from the decode side so the two layout-sensitive +// scalar hot paths do not perturb each other's code layout. + +#include "bolt/row/dense/DenseRowScalar.h" + +#include + +#include "bolt/row/dense/DenseRowGeneral.h" +#include "bolt/row/dense/IntVarint.h" + +namespace bytedance::bolt::row::dense_row::scalar { + +template +FOLLY_ALWAYS_INLINE void +addIntColumnSizes(const DecodedVector& dec, vector_size_t N, size_t* rowSizes) { + const auto* raw = dec.data(); + if (dec.isIdentityMapping()) { + // Flat column. A null rawNulls() means no nulls (mayHaveNulls() is only a + // conservative upper bound), so this also covers the "may-null flag set + // without a backing bitmap" case. All int widths handled: int64 directly, + // int8/int16/int32 via int32. + const uint64_t* nulls = + dec.mayHaveNulls() ? dec.base()->rawNulls() : nullptr; + if (nulls) { + // SIMD value sizes + branchless null override via the row-indexed + // validity bitmap. + detail::addNullableIntColumnSizes( + raw, nulls, rowSizes, static_cast(N)); + } else { + // Contiguous, no value-nulls: portable xsimd size kernel scattered + // per row. + detail::addNoNullIntColumnSizes(raw, rowSizes, static_cast(N)); + } + return; + } + if (dec.isConstantMapping()) { + // Every row maps to the same base value (or all rows are null), so the + // serialized size is identical — compute it once and splat across rows. + const bool isNull = dec.mayHaveNulls() && dec.isNullAt(0); + const int64_t v = isNull ? 0 : static_cast(raw[dec.index(0)]); + const size_t sz = detail::nullableInt64SerializedSize(v, isNull); + for (vector_size_t r = 0; r < N; ++r) { + rowSizes[r] += sz; + } + return; + } + // General path: dictionary mappings. + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + const bool isNull = mayNulls && dec.isNullAt(r); + const int64_t v = isNull ? 0 : static_cast(raw[dec.index(r)]); + rowSizes[r] += detail::nullableInt64SerializedSize(v, isNull); + } +} + +template +FOLLY_ALWAYS_INLINE void addFixedColumnSizes( + vector_size_t N, + size_t* rowSizes) { + for (vector_size_t r = 0; r < N; ++r) { + rowSizes[r] += KBYTES; + } +} + +void addColumnSizes( + const Type& type, + const DecodedVector& dec, + vector_size_t N, + size_t* rowSizes) { + switch (type.kind()) { + case TypeKind::BOOLEAN: + case TypeKind::UNKNOWN: + addFixedColumnSizes<1>(N, rowSizes); + return; + case TypeKind::TINYINT: + addIntColumnSizes(dec, N, rowSizes); + return; + case TypeKind::SMALLINT: + addIntColumnSizes(dec, N, rowSizes); + return; + case TypeKind::INTEGER: + addIntColumnSizes(dec, N, rowSizes); + return; + case TypeKind::BIGINT: + addIntColumnSizes(dec, N, rowSizes); + return; + case TypeKind::REAL: + addFixedColumnSizes(N, rowSizes); + return; + case TypeKind::DOUBLE: + addFixedColumnSizes(N, rowSizes); + return; + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: { + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + if (mayNulls && dec.isNullAt(r)) { + ++rowSizes[r]; + } else { + const auto len = dec.valueAt(r).size(); + rowSizes[r] += + detail::varintSize(static_cast(len) + 1) + len; + } + } + return; + } + case TypeKind::TIMESTAMP: { + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + if (mayNulls && dec.isNullAt(r)) { + ++rowSizes[r]; + } else { + rowSizes[r] += detail::nullableInt64SerializedSize( + dec.valueAt(r).toMicros(), false); + } + } + return; + } + case TypeKind::HUGEINT: { + // null folded into the low slot: nullableInt64(zigzag128 low 64), then + // varint(high 64) if non-null. See detail::writeNullableInt128. + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + const bool isNull = mayNulls && dec.isNullAt(r); + rowSizes[r] += detail::nullableInt128SerializedSize( + isNull ? int128_t{0} : dec.valueAt(r), isNull); + } + return; + } + default: + BOLT_UNREACHABLE(); + } +} + +template +FOLLY_ALWAYS_INLINE void writeIntColumn( + const DecodedVector& dec, + vector_size_t N, + uint8_t** rowCursors) { + const bool mayNulls = dec.mayHaveNulls(); + const bool identity = dec.isIdentityMapping(); + const auto* raw = dec.data(); + for (vector_size_t r = 0; r < N; ++r) { + const bool isNull = mayNulls && dec.isNullAt(r); + const int64_t v = isNull + ? 0 + : static_cast(identity ? raw[r] : raw[dec.index(r)]); + rowCursors[r] = writeNullableInt64(v, isNull, rowCursors[r]); + } +} + +void writeColumn( + const Type& type, + const DecodedVector& dec, + vector_size_t N, + uint8_t** rowCursors) { + switch (type.kind()) { + case TypeKind::BOOLEAN: { + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + if (mayNulls && dec.isNullAt(r)) { + *rowCursors[r]++ = uint8_t{0}; + } else { + *rowCursors[r]++ = dec.valueAt(r) ? uint8_t{2} : uint8_t{1}; + } + } + return; + } + case TypeKind::UNKNOWN: + for (vector_size_t r = 0; r < N; ++r) { + *rowCursors[r]++ = uint8_t{0}; + } + return; + case TypeKind::TINYINT: + writeIntColumn(dec, N, rowCursors); + return; + case TypeKind::SMALLINT: + writeIntColumn(dec, N, rowCursors); + return; + case TypeKind::INTEGER: + writeIntColumn(dec, N, rowCursors); + return; + case TypeKind::BIGINT: + writeIntColumn(dec, N, rowCursors); + return; + case TypeKind::REAL: { + const bool mayNulls = dec.mayHaveNulls(); + const bool identity = dec.isIdentityMapping(); + const auto* raw = dec.data(); + for (vector_size_t r = 0; r < N; ++r) { + uint32_t b; + if (mayNulls && dec.isNullAt(r)) { + b = kNullFloatBits; + } else { + const float value = identity ? raw[r] : raw[dec.index(r)]; + std::memcpy(&b, &value, sizeof(b)); + // kNullFloatBits is the canonical quiet NaN. Flipping the low + // mantissa bit yields another NaN. + if (FOLLY_UNLIKELY(b == kNullFloatBits)) { + b ^= 1u; + } + } + std::memcpy(rowCursors[r], &b, sizeof(b)); + rowCursors[r] += sizeof(b); + } + return; + } + case TypeKind::DOUBLE: { + const bool mayNulls = dec.mayHaveNulls(); + const bool identity = dec.isIdentityMapping(); + const auto* raw = dec.data(); + for (vector_size_t r = 0; r < N; ++r) { + uint64_t b; + if (mayNulls && dec.isNullAt(r)) { + b = kNullDoubleBits; + } else { + const double value = identity ? raw[r] : raw[dec.index(r)]; + std::memcpy(&b, &value, sizeof(b)); + // kNullDoubleBits is the canonical quiet NaN. Flipping the low + // mantissa bit yields another NaN. + if (FOLLY_UNLIKELY(b == kNullDoubleBits)) { + b ^= 1ull; + } + } + std::memcpy(rowCursors[r], &b, sizeof(b)); + rowCursors[r] += sizeof(b); + } + return; + } + case TypeKind::VARCHAR: + case TypeKind::VARBINARY: { + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + uint8_t* out = rowCursors[r]; + if (mayNulls && dec.isNullAt(r)) { + out = writeVarint(0, out); + } else { + const auto sv = dec.valueAt(r); + out = writeVarint(static_cast(sv.size()) + 1, out); + std::memcpy(out, sv.data(), sv.size()); + out += sv.size(); + } + rowCursors[r] = out; + } + return; + } + case TypeKind::TIMESTAMP: { + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + const bool isNull = mayNulls && dec.isNullAt(r); + const int64_t v = isNull ? 0 : dec.valueAt(r).toMicros(); + rowCursors[r] = writeNullableInt64(v, isNull, rowCursors[r]); + } + return; + } + case TypeKind::HUGEINT: { + const bool mayNulls = dec.mayHaveNulls(); + for (vector_size_t r = 0; r < N; ++r) { + const bool isNull = mayNulls && dec.isNullAt(r); + rowCursors[r] = writeNullableInt128( + isNull ? int128_t{0} : dec.valueAt(r), + isNull, + rowCursors[r]); + } + return; + } + default: + BOLT_UNREACHABLE(); + } +} + +} // namespace bytedance::bolt::row::dense_row::scalar diff --git a/bolt/row/dense/IntVarint.h b/bolt/row/dense/IntVarint.h new file mode 100644 index 000000000..fba442f47 --- /dev/null +++ b/bolt/row/dense/IntVarint.h @@ -0,0 +1,687 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) +#include +#endif + +#include "bolt/common/base/BitUtil.h" +#include "bolt/type/HugeInt.h" + +// Integer varint codec for the dense row format. Organized bottom-up in five +// layers; each layer only calls the ones below it: +// +// L1 raw varint — LEB128 read/write (scalar / BMI2 / dispatchers) +// L2 zigzag — 64/128-bit sign-folding primitives +// L3 nullable codec — single-value wire mapping: null = 0x00, INT64_MIN +// sentinel, else varint(zigzag(adjust(v))) +// L4 SIMD size kernels — per-batch encoded-size computation (xsimd lanes) +// L5 column-level — whole-column size sum / per-row scatter loops +// +// Naming: a `*Batch` suffix marks an L4 pure kernel over one SIMD batch; +// un-suffixed L5 functions loop over a whole array. The scalar L3 size math and +// the L4 SIMD kernels intentionally duplicate the same formula — SIMD main +// loops need a scalar tail, and encode correctness relies on the two agreeing. +namespace bytedance::bolt::row::detail { + +// ============================================================================= +// L1 — Raw varint (LEB128): scalar + BMI2 implementations and dispatchers. +// ============================================================================= + +// A varint byte carries 7 payload bits; the high bit (0x80) is the +// continuation flag. The final byte of a varint is the one with it clear. +constexpr uint64_t kVarintPayloadBits{0x7f}; +// 8-byte-wide versions of the payload / continuation bit patterns, for the +// BMI2 pdep/pext paths that process 8 wire bytes per step. +constexpr uint64_t kVarintPayloadMask64{0x7f7f7f7f7f7f7f7fULL}; +constexpr uint64_t kVarintContinuationMask64{0x8080808080808080ULL}; + +FOLLY_ALWAYS_INLINE bool varintIsLastByte(uint8_t b) { + return (b & 0x80) == 0; +} + +FOLLY_ALWAYS_INLINE size_t varintSize(uint64_t value) { + const auto bits = 64 - __builtin_clzll(value | 1ULL); + return static_cast((bits + 6) / 7); +} + +FOLLY_ALWAYS_INLINE uint8_t* writeVarintScalar(uint64_t value, uint8_t* out) { + while (value >= 0x80) { + *out++ = static_cast(value) | 0x80; + value >>= 7; + } + *out++ = static_cast(value); + return out; +} + +// No bounds check: reads until the terminator, capped at 10 bytes by the +// varint structural limit (shift < 64). Over-reads only on malformed input; +// readVarint's single in <= end check validates the consumed length. +FOLLY_ALWAYS_INLINE bool readVarintScalar(const uint8_t*& in, uint64_t& value) { + uint64_t result{0}; + uint32_t shift{0}; + while (shift < 64) { + auto byte = *in++; + result |= ((byte & kVarintPayloadBits) << shift); + if (varintIsLastByte(byte)) { + value = result; + return true; + } + shift += 7; + } + return false; +} + +#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) + +constexpr std::array makeVarintContinuationMasks() { + std::array masks{}; + for (size_t len = 1; len < masks.size(); ++len) { + uint64_t mask{0}; + for (size_t i = 0; i + 1 < len; ++i) { + mask |= (0x80ULL << (i * 8)); + } + masks[len] = mask; + } + return masks; +} + +inline constexpr std::array kVarintContinuationMasks = + makeVarintContinuationMasks(); + +inline __attribute__((target("bmi2"))) uint8_t* writeVarintBmi2( + uint64_t value, + uint8_t* out) { + if (value < (1ULL << 56)) { + const auto bits = 64 - __builtin_clzll(value | 1ULL); + const auto len = static_cast((bits + 6) / 7); + + uint64_t packed = _pdep_u64(value, kVarintPayloadMask64); + // _pdep places only the 7 data bits. Set continuation bits (MSB=1) for + // the first len - 1 bytes; the last byte keeps MSB=0. + packed |= kVarintContinuationMasks[len]; + std::memcpy(out, &packed, len); + return out + len; + } + + // Values >= 2^63 require 10 bytes in unsigned varint form (e.g. + // zigzag(INT64_MAX) == 2^64 - 2). Encode the first 8 bytes with BMI2, + // then encode the remaining <=8 bits with scalar (1-2 bytes). + uint64_t packed = _pdep_u64(value, kVarintPayloadMask64); + packed |= kVarintContinuationMask64; + std::memcpy(out, &packed, 8); + out += 8; + return writeVarintScalar(value >> 56, out); +} + +inline __attribute__((target("bmi2"))) bool +readVarintBmi2(const uint8_t*& in, const uint8_t* end, uint64_t& value) { + // `end - in >= 8` is a memory-safety guard, NOT a redundant validity check: + // the 8-byte bulk load below would read past the buffer for a valid 5-7 byte + // varint in the final bytes (buffers are sized exactly, no tail padding). + // With < 8 bytes left, fall to the byte-at-a-time scalar reader. Truncation + // is caught by readVarint's single in <= end check. + if (end - in >= 8) { + uint64_t word; + std::memcpy(&word, in, sizeof(word)); + + const uint64_t stopMask = (~word) & kVarintContinuationMask64; + if (stopMask != 0) { + const auto len = + static_cast((__builtin_ctzll(stopMask) >> 3) + 1); + uint64_t decoded = _pext_u64(word, kVarintPayloadMask64); + if (len < 8) { + decoded &= ((1ULL << (len * 7)) - 1); + } + value = decoded; + in += len; + return true; + } + + // 9-10 byte varint: the first 8 bytes all continue, so a well-formed varint + // has its terminator within in[8..9] (in-bounds when end - in >= 9/10). No + // per-byte truncation check; readVarint's in <= end catches a short input. + uint64_t decoded = _pext_u64(word, kVarintPayloadMask64); + auto* cursor = in + 8; + const auto byte8 = *cursor++; + decoded |= (static_cast(byte8 & 0x7f) << 56); + if ((byte8 & 0x80) == 0) { + value = decoded; + in = cursor; + return true; + } + + const auto byte9 = *cursor++; + decoded |= (static_cast(byte9 & 0x1) << 63); + value = decoded; + in = cursor; + return true; + } + + return readVarintScalar(in, value); +} + +#endif + +// Inlined fast path for varints up to 3 bytes (values 0..2^21-1 = 2_097_151). +// Covers the dominant cases in null-fused encodings: +// - row markers (varint(0/1) → 1 byte) +// - VARCHAR lengths up to ~2M +// - BIGINT values in [-2^20, 2^20-1] after zigzag+adjust (covers lt_2pow8, +// lt_2pow16, and ~half of lt_2pow32 entries) +// - ARRAY/MAP cardinalities 0..2_097_151 +// +// The BMI2 path costs an 8-byte load + tzcnt + pext (10-15 cycle dep chain). +// Inlining up to 3 byte checks (each ~2 cycles) keeps the dep chain short +// and lets the OoO window see much more parallelism across rows. +// +// On failure (4+ byte varint or truncated input), caller falls back to +// BMI2/scalar. No bounds checks here: a well-formed varint stops at its +// terminator within the buffer; reads run past `end` only on malformed input, +// bounded to 4 bytes, and readVarint's single in <= end check validates the +// consumed length. +// Each length below reconstructs its whole value inside its own return, so the +// fall-through path (5+ byte varint) does no value arithmetic and the 1-byte +// case skips the payload mask — this is why it stays hand-unrolled rather than +// looped (a loop that accumulated the value each step measured ~1% slower on +// decode). Earlier bytes (b0..b{k-1}) are known to carry continuation bits, so +// only their payload (low 7) bits contribute; the terminating byte is whole. +FOLLY_ALWAYS_INLINE bool readVarintShortFastPath( + const uint8_t*& in, + uint64_t& value) { + constexpr uint64_t kP = kVarintPayloadBits; + + const uint8_t b0 = in[0]; + if (FOLLY_LIKELY(varintIsLastByte(b0))) { // 1 byte (< 2^7) + value = b0; + in += 1; + return true; + } + + const uint8_t b1 = in[1]; + if (FOLLY_LIKELY(varintIsLastByte(b1))) { // 2 bytes (< 2^14) + value = (b0 & kP) | (uint64_t{b1} << 7); + in += 2; + return true; + } + + const uint8_t b2 = in[2]; + if (FOLLY_LIKELY(varintIsLastByte(b2))) { // 3 bytes (< 2^21) + value = (b0 & kP) | ((b1 & kP) << 7) | (uint64_t{b2} << 14); + in += 3; + return true; + } + + const uint8_t b3 = in[3]; + if (FOLLY_LIKELY(varintIsLastByte(b3))) { // 4 bytes (< 2^28) + value = + (b0 & kP) | ((b1 & kP) << 7) | ((b2 & kP) << 14) | (uint64_t{b3} << 21); + in += 4; + return true; + } + return false; +} + +FOLLY_ALWAYS_INLINE uint8_t* writeVarint(uint64_t value, uint8_t* out) { + if (value < 0x80) { + *out++ = static_cast(value); + return out; + } + +#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) + if (value >= (1ULL << 35)) { + return writeVarintBmi2(value, out); + } +#endif + return writeVarintScalar(value, out); +} + +// Bounds are validated once here: the inner readers parse optimistically (no +// per-byte end checks) and may run a few bytes past `end` on malformed input; +// the single in <= end check rejects any read that over-ran the buffer. +FOLLY_ALWAYS_INLINE bool +readVarint(const uint8_t*& in, const uint8_t* end, uint64_t& value) { + if (FOLLY_UNLIKELY(in >= end)) { + return false; + } + if (FOLLY_LIKELY(readVarintShortFastPath(in, value))) { + return in <= end; + } +#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) + return readVarintBmi2(in, end, value) && in <= end; +#else + return readVarintScalar(in, value) && in <= end; +#endif +} + +// ============================================================================= +// L2 — ZigZag sign folding (64- and 128-bit). +// ============================================================================= + +FOLLY_ALWAYS_INLINE constexpr uint64_t zigZagEncode64(int64_t value) { + return (static_cast(value) << 1) ^ + static_cast(value >> 63); +} + +FOLLY_ALWAYS_INLINE constexpr int64_t zigZagDecode64(uint64_t encoded) { + return static_cast((encoded >> 1) ^ (0 - (encoded & 1))); +} + +FOLLY_ALWAYS_INLINE constexpr uint128_t zigZagEncode128(int128_t value) { + return (static_cast(value) << 1) ^ + static_cast(value >> 127); +} + +FOLLY_ALWAYS_INLINE constexpr int128_t zigZagDecode128(uint128_t encoded) { + return static_cast( + (encoded >> 1) ^ (~static_cast(0) * (encoded & 1))); +} + +// ============================================================================= +// L3 — Nullable-int wire codec (single value). +// ============================================================================= + +// Nullable int64 wire mapping: +// - null -> 0x00. +// - INT64_MIN -> 0x80 0x00 (reserved sentinel). +// - all other values -> varint(zigzag(adjust(v))), where adjust(v) keeps +// positive values unchanged and shifts non-positive values by -1. +// +// The reserved sentinel keeps null as a single-byte marker while preserving a +// one-to-one mapping for the full int64 domain. +constexpr std::array kInt64MinSentinel{0x80, 0x00}; + +FOLLY_ALWAYS_INLINE constexpr bool needsExtendedInt64Encoding(int64_t value) { + return value == std::numeric_limits::min(); +} + +FOLLY_ALWAYS_INLINE constexpr int64_t adjustInt64ForNullableEncoding( + int64_t value) { + return value > 0 ? value : value - 1; +} + +FOLLY_ALWAYS_INLINE constexpr int64_t restoreInt64FromNullableEncoding( + int64_t value) { + return value > 0 ? value : value + 1; +} + +FOLLY_ALWAYS_INLINE size_t +nullableInt64SerializedSize(int64_t value, bool isNull) { + if (isNull) { + return 1; + } + + if (needsExtendedInt64Encoding(value)) { + return 2; + } + + // size == ceil((bitlen(|v|)+1)/7) == varintSize(zigzag(adjust(v))) — the wire + // mapping only moves which 2^(7k-1) bucket the value lands in, which |v| + // already captures, so we skip the zigzag/adjust and clz |v| directly. + // INT64_MIN is excluded above, so the unsigned abs is exact. + const auto uv = static_cast(value); + const auto sign = static_cast(value >> 63); + const uint64_t mag = (uv ^ sign) - sign; // |value|, no signed-overflow UB + const auto bits = 64 - __builtin_clzll(mag | 1ULL); + return static_cast((bits + 7) / 7); +} + +FOLLY_ALWAYS_INLINE uint8_t* +writeNullableInt64(int64_t value, bool isNull, uint8_t* out) { + if (isNull) { + *out++ = 0; + return out; + } + + if (FOLLY_UNLIKELY(needsExtendedInt64Encoding(value))) { + *out++ = kInt64MinSentinel[0]; + *out++ = kInt64MinSentinel[1]; + return out; + } + + return writeVarint( + zigZagEncode64(adjustInt64ForNullableEncoding(value)), out); +} + +FOLLY_ALWAYS_INLINE bool readNullableInt64( + const uint8_t*& in, + const uint8_t* end, + bool& isNull, + int64_t& value) { + if (FOLLY_UNLIKELY(in >= end)) { + return false; + } + if (*in == 0) { + ++in; + isNull = true; + value = 0; + return in <= end; + } + + if (FOLLY_UNLIKELY( + in[0] == kInt64MinSentinel[0] && in[1] == kInt64MinSentinel[1])) { + in += 2; + isNull = false; + value = std::numeric_limits::min(); + return in <= end; + } + + uint64_t encoded{0}; + if (!readVarint(in, end, encoded)) { + return false; + } + + isNull = false; + value = restoreInt64FromNullableEncoding(zigZagDecode64(encoded)); + return in <= end; +} + +// Nullable int128 wire mapping (two halves of zigzag128(v), no separate tag): +// null -> nullableInt64(_, null) (a single 0x00 byte) +// non-null -> nullableInt64(low 64 of zigzag128(v)), varint(high 64). +// The null marker is folded into the low int64 slot via the nullable-int64 +// codec's own 0x00 sentinel, so there is no extra present/null tag byte: a +// non-null value's low half just rides the same slot, and the high half follows +// only when present. zigzag128 keeps small-magnitude values (either sign) +// short, and the two halves reuse the 64-bit varint path (no 128-bit varint). +// The low half is reinterpreted as int64 for the nullable-int64 codec; that +// round-trips bit-for-bit (it is a bijection over int64 plus null). +FOLLY_ALWAYS_INLINE size_t +nullableInt128SerializedSize(int128_t value, bool isNull) { + if (isNull) { + return nullableInt64SerializedSize(0, /*isNull=*/true); + } + const uint128_t zz = zigZagEncode128(value); + return nullableInt64SerializedSize( + static_cast(static_cast(zz)), + /*isNull=*/false) + + varintSize(static_cast(zz >> 64)); +} + +FOLLY_ALWAYS_INLINE uint8_t* +writeNullableInt128(int128_t value, bool isNull, uint8_t* out) { + if (isNull) { + return writeNullableInt64(0, /*isNull=*/true, out); + } + const uint128_t zz = zigZagEncode128(value); + out = writeNullableInt64( + static_cast(static_cast(zz)), /*isNull=*/false, out); + return writeVarint(static_cast(zz >> 64), out); +} + +FOLLY_ALWAYS_INLINE bool readNullableInt128( + const uint8_t*& in, + const uint8_t* end, + bool& isNull, + int128_t& value) { + int64_t low{0}; + if (!readNullableInt64(in, end, isNull, low)) { + return false; + } + if (isNull) { + value = 0; + return true; + } + uint64_t hi{0}; + if (!readVarint(in, end, hi)) { + return false; + } + value = zigZagDecode128( + (static_cast(hi) << 64) | static_cast(low)); + return true; +} + +// ============================================================================= +// L4 — Portable (xsimd) nullable-int size kernels: encoded byte counts for one +// SIMD batch of values. Two kernels: int32 and int64; int8/int16 widen to +// int32. The size math is pure xsimd::batch, so it runs on AVX2 / AVX-512 / +// SSE / NEON (compatibility). These must agree with the scalar +// nullableInt64SerializedSize above. +// ============================================================================= + +// int32: width-adaptive, native uint32 lanes. zigzag of an int32 fits uint32 +// for every value except INT32_MIN (special-cased to 5). 4 thresholds; xsimd's +// unsigned-batch comparison handles the unsigned compare portably. +FOLLY_ALWAYS_INLINE xsimd::batch nullableInt32SizesBatch( + xsimd::batch v) { + using S = xsimd::batch; + using U = xsimd::batch; + const S zero(0); + const S adj = v - S(v <= zero); // v > 0 ? v : v - 1 + const U zz = xsimd::bitwise_cast((adj << 1) ^ (adj >> 31)); // zigzag + U s(1); + s += U(zz > U(static_cast((1 << 7) - 1))); + s += U(zz > U(static_cast((1 << 14) - 1))); + s += U(zz > U(static_cast((1 << 21) - 1))); + s += U(zz > U(static_cast((1 << 28) - 1))); + return xsimd::select( + xsimd::batch_bool_cast( + v == S(std::numeric_limits::min())), + U(5), + s); +} + +// int64 size kernel, computed straight from |v| (no zigzag/adjust): size(v) = +// min k with |v| < 2^(7k-1), i.e. threshold |v| against {2^6, 2^13, ... 2^62}. +// The >=2^63 (10-byte) case is just the top threshold — no separate sign fixup. +// INT64_MIN's abs overflows back to a negative value, so all thresholds miss +// (s=1) and the final select sets it to the 2-byte sentinel. Branchless. +// +// This abs form measured ~20% faster than an equivalent zigzag-based kernel (9 +// magnitude thresholds replace zigzag's 8 thresholds + a zz<0 select, and they +// sit on a shorter dependency chain since |v| is cheaper to derive than the +// zigzag key), so it is the single int64 size kernel used everywhere. +FOLLY_ALWAYS_INLINE xsimd::batch nullableInt64SizesBatch( + xsimd::batch v) { + using B = xsimd::batch; + const B sign = v >> 63; + const B m = (v ^ sign) - sign; // abs(v); INT64_MIN stays negative + // 9 magnitude thresholds (emulating a clz, which AVX2 can't vectorize). The + // serial `s +=` is not the bottleneck — the compiler reassociates these + // associative adds into a tree, and an explicit tree measured identically. + B s = B(1); + s += B(m > B((1LL << 6) - 1)); + s += B(m > B((1LL << 13) - 1)); + s += B(m > B((1LL << 20) - 1)); + s += B(m > B((1LL << 27) - 1)); + s += B(m > B((1LL << 34) - 1)); + s += B(m > B((1LL << 41) - 1)); + s += B(m > B((1LL << 48) - 1)); + s += B(m > B((1LL << 55) - 1)); + s += B(m > B((1LL << 62) - 1)); + return xsimd::select(v == B(std::numeric_limits::min()), B(2), s); +} + +// ============================================================================= +// L5 — Column-level loops: size sums and per-row scatters over whole arrays, +// built on the L4 kernels with scalar (L3) tails. +// +// Narrow int8/int16 inputs widen to the int32 size kernel via xsimd's +// converting load `batch::load_unaligned(const T*)`, which reads +// batch::size narrow values and sign-extends each to an int32 lane. +// It picks the right widening per ISA (AVX2/AVX-512/SSE/NEON) with no +// target-specific intrinsics; for int32 input it is a plain load. +// ============================================================================= + +// Sum of nullable-int sizes for a contiguous non-null range of any of +// int8/int16/int32/int64. Narrow types widen to the int32 kernel. int64 gets a +// testz-style fast path: if every zigzag in a batch fits 32 bits (common +// small-magnitude BIGINT) only 4 thresholds are needed; xsimd::all keeps it +// portable. +template +FOLLY_ALWAYS_INLINE size_t sumNullableIntSizes(const T* raw, size_t count) { + static_assert( + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v, + "sumNullableIntSizes supports int8/int16/int32/int64"); + if constexpr (std::is_same_v) { + using B = xsimd::batch; + constexpr std::size_t kBatchSize = B::size; + const B zero(0); + const B one(1); + B acc(0); + std::size_t j = 0; + for (; j + kBatchSize <= count; j += kBatchSize) { + const B v = B::load_unaligned(raw + j); + const B adj = v - B(v <= zero); + const B zz = (adj << 1) ^ (adj >> 63); + if (xsimd::all((zz >> 32) == zero)) { + B s = one; + s += B(zz > B((1LL << 7) - 1)); + s += B(zz > B((1LL << 14) - 1)); + s += B(zz > B((1LL << 21) - 1)); + s += B(zz > B((1LL << 28) - 1)); + acc += s; + } else { + acc += nullableInt64SizesBatch(v); + } + } + auto total = static_cast(xsimd::reduce_add(acc)); + for (; j < count; ++j) { + total += nullableInt64SerializedSize(raw[j], false); + } + return total; + } else { + using U = xsimd::batch; + constexpr std::size_t kBatchSize = xsimd::batch::size; + U acc(0U); + std::size_t j = 0; + for (; j + kBatchSize <= count; j += kBatchSize) { + acc += nullableInt32SizesBatch( + xsimd::batch::load_unaligned(raw + j)); + } + auto total = static_cast(xsimd::reduce_add(acc)); + for (; j < count; ++j) { + total += nullableInt64SerializedSize(static_cast(raw[j]), false); + } + return total; + } +} + +// Per-row scatter for a non-null column: add each value's size into its own +// rowSizes[r]. +template +FOLLY_ALWAYS_INLINE void +addNoNullIntColumnSizes(const T* raw, size_t* rowSizes, size_t count) { + std::size_t j = 0; + if constexpr (std::is_same_v) { + // Sizes are int64 already: add the batch straight into rowSizes (portable). + // Branchless (no testz fastpath): a per-batch "all small" branch regresses + // mixed/full-range BIGINT via misprediction, which dominates the + // small-value saving. + using B = xsimd::batch; + constexpr std::size_t kWidth = B::size; + auto* rs = reinterpret_cast(rowSizes); + for (; j + kWidth <= count; j += kWidth) { + B sz = nullableInt64SizesBatch(B::load_unaligned(raw + j)); + (B::load_unaligned(rs + j) + sz).store_unaligned(rs + j); + } + } else { + constexpr std::size_t kWidth = xsimd::batch::size; + alignas(64) uint32_t sz[kWidth]; + for (; j + kWidth <= count; j += kWidth) { + nullableInt32SizesBatch(xsimd::batch::load_unaligned(raw + j)) + .store_aligned(sz); + for (std::size_t k = 0; k < kWidth; ++k) { + rowSizes[j + k] += sz[k]; + } + } + } + // reset of rows + for (; j < count; ++j) { + rowSizes[j] += + nullableInt64SerializedSize(static_cast(raw[j]), false); + } +} + +// Overload for a FLAT int column WITH nulls (the common Spark case that +// otherwise falls to the scalar loop). A null row contributes exactly 1 byte +// (the 0x00 marker), independent of the (garbage) value stored at its slot. +// `nulls` is the row-indexed validity bitmap (bit set = non-null), which the +// caller guarantees non-null. Supports int8/int16/int32/int64; narrow types +// widen to the int32 size kernel exactly like addNoNullIntColumnSizes. +template +FOLLY_ALWAYS_INLINE void addNullableIntColumnSizes( + const T* raw, + const uint64_t* nulls, + size_t* rowSizes, + size_t count) { + std::size_t j = 0; + if constexpr (std::is_same_v) { + using B = xsimd::batch; + constexpr std::size_t kBatchSize = B::size; + // Validity bitmaps are addressed in 64-bit words. + constexpr std::size_t kWordBits = 64; + auto* rs = reinterpret_cast(rowSizes); + + // Per-lane bit selector {1<<0, 1<<1, ...}, built once. + int64_t selArr[kBatchSize]; + for (std::size_t i = 0; i < kBatchSize; ++i) { + selArr[i] = static_cast(int64_t{1} << i); + } + const B laneSel = B::load_aligned(selArr); + const B one(1); + + for (; j + kBatchSize <= count; j += kBatchSize) { + B sz = nullableInt64SizesBatch(B::load_unaligned(raw + j)); + // kBatchSize validity bits for rows [j, j+kBatchSize). kBatchSize divides + // kWordBits and j is a multiple of kBatchSize, so the bits never straddle + // a word. + const uint64_t word = nulls[j / kWordBits]; + const uint64_t validBits = (word >> (j % kWordBits)) & + bits::lowMask(static_cast(kBatchSize)); + // lane i valid iff bit i set: (broadcast(validBits) & laneSel) == + // laneSel. + const auto isValid = + (B(static_cast(validBits)) & laneSel) == laneSel; + sz = xsimd::select(isValid, sz, one); // null -> 1 byte + (B::load_unaligned(rs + j) + sz).store_unaligned(rs + j); + } + } else { + // Narrow int8/int16/int32: widen to the int32 size kernel, then override + // null lanes to 1 byte. + constexpr std::size_t kWidth = xsimd::batch::size; + uint32_t sz[kWidth]; + for (; j + kWidth <= count; j += kWidth) { + nullableInt32SizesBatch(xsimd::batch::load_unaligned(raw + j)) + .store_aligned(sz); + for (std::size_t k = 0; k < kWidth; ++k) { + const bool isNull = !bits::isBitSet(nulls, static_cast(j + k)); + rowSizes[j + k] += isNull ? 1U : sz[k]; + } + } + } + for (; j < count; ++j) { + const bool isNull = !bits::isBitSet(nulls, static_cast(j)); + rowSizes[j] += nullableInt64SerializedSize( + isNull ? 0 : static_cast(raw[j]), isNull); + } +} + +} // namespace bytedance::bolt::row::detail diff --git a/bolt/row/tests/CMakeLists.txt b/bolt/row/tests/CMakeLists.txt index 363687332..b2c8e0560 100644 --- a/bolt/row/tests/CMakeLists.txt +++ b/bolt/row/tests/CMakeLists.txt @@ -25,13 +25,15 @@ # This modified file is released under the same license. # -------------------------------------------------------------------------- -add_executable(bolt_row_test CompactRowTest.cpp UnsafeRowTest.cpp) +add_executable(bolt_row_test CompactRowTest.cpp UnsafeRowTest.cpp + DenseRowTest.cpp) add_test(bolt_row_test bolt_row_test) target_link_libraries( bolt_row_test - PRIVATE bolt_testutils + PRIVATE bolt_row_fast + bolt_testutils Folly::folly GTest::gtest GTest::gtest_main diff --git a/bolt/row/tests/DenseRowTest.cpp b/bolt/row/tests/DenseRowTest.cpp new file mode 100644 index 000000000..db4f1f931 --- /dev/null +++ b/bolt/row/tests/DenseRowTest.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "bolt/row/dense/DenseRow.h" +#include "bolt/vector/fuzzer/VectorFuzzer.h" +#include "bolt/vector/tests/utils/VectorTestBase.h" + +using namespace bytedance::bolt::test; + +namespace bytedance::bolt::row { +namespace { + +class DenseRowTest : public ::testing::Test, public VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance(memory::MemoryManager::Options{}); + } + + // Serialize a RowVector into one contiguous buffer plus (N + 1) cumulative + // offsets — the test-side equivalent of how shuffle lays out a partition + // buffer from DenseRow::rowSizes() and then DenseRow::serialize()s into it. + struct Bytes { + std::vector buffer; + std::vector offsets; // size N + 1 + + std::string_view toView(size_t index) const { + return std::string_view( + reinterpret_cast(buffer.data()) + offsets[index], + offsets[index + 1] - offsets[index]); + } + + std::string toHex(size_t index) const { + auto view = toView(index); + std::string out; + out.reserve(view.size() * 2); + static constexpr char kHex[] = "0123456789abcdef"; + for (unsigned char c : view) { + out.push_back(kHex[c >> 4]); + out.push_back(kHex[c & 0x0f]); + } + return out; + } + }; + + static Bytes serializeToBytes(const RowVectorPtr& input) { + DenseRow rows(input); + const auto n = rows.numRows(); + Bytes out; + out.offsets.resize(n + 1); + size_t cum = 0; + for (vector_size_t r = 0; r < n; ++r) { + out.offsets[r] = cum; + cum += rows.rowSizes()[r]; + } + out.offsets[n] = cum; + EXPECT_EQ(cum, rows.totalSize()); + out.buffer.resize(std::max(cum, 1)); + rows.serialize( + out.buffer.data(), folly::Range(out.offsets.data(), n)); + return out; + } + + // DenseRow is marker-less (no top-level null rows), so rebuild a null-free + // RowVector from the fuzzed input's children, then serialize -> split -> + // deserialize and compare. + void roundTrip(const RowVectorPtr& fuzzed) { + auto input = makeRowVector(fuzzed->children()); + const auto rowType = + std::dynamic_pointer_cast(input->type()); + ASSERT_NE(rowType, nullptr); + const auto n = input->size(); + + auto bytes = serializeToBytes(input); + std::vector data(n); + for (vector_size_t r = 0; r < n; ++r) { + data[r] = bytes.toView(r); + } + auto out = DenseRow::deserialize(data, rowType, pool()); + assertEqualVectors(input, out); + } + + VectorPtr + fuzzVector(const TypePtr& type, vector_size_t size, uint32_t seed = 7) { + VectorFuzzer::Options opts; + opts.vectorSize = size; + opts.nullRatio = 0.2; + opts.dictionaryHasNulls = false; + opts.stringVariableLength = true; + opts.stringLength = 24; + opts.containerVariableLength = true; + opts.containerLength = 7; + opts.timestampPrecision = + VectorFuzzer::Options::TimestampPrecision::kMicroSeconds; + + VectorFuzzer fuzzer(opts, pool(), seed); + return fuzzer.fuzzFlat(type, size); + } +}; + +TEST_F(DenseRowTest, dictionaryEncodedInput) { + // DenseRow decodes via DecodedVector (buildPlan), so dictionary-wrapped + // inputs round-trip. + auto base = makeFlatVector({100, 200, 300, 400}); + auto indices = makeIndicesInReverse(4); + auto dict = BaseVector::wrapInDictionary(nullptr, indices, 4, base); + roundTrip(makeRowVector({dict})); +} + +TEST_F(DenseRowTest, constantEncodedInput) { + // Constant-wrapped scalars decode via DecodedVector with isConstantMapping(); + // the scalar encoder sizes them once and splats. Cover a non-null constant, + // a null constant, and a constant in a wide row alongside a flat column. + roundTrip(makeRowVector({makeConstant(987654321, 16)})); + roundTrip(makeRowVector({makeNullConstant(TypeKind::BIGINT, 16)})); + roundTrip(makeRowVector({ + makeConstant(-7, 16), + makeConstant(123, 16), + makeNullConstant(TypeKind::TINYINT, 16), + makeFlatVector(16, [](auto r) { return r * 3 - 5; }), + })); +} + +TEST_F(DenseRowTest, rowOfScalars) { + roundTrip(std::dynamic_pointer_cast(fuzzVector( + ROW({BIGINT(), INTEGER(), REAL(), DOUBLE(), VARCHAR()}), 128, 11))); +} + +TEST_F(DenseRowTest, multiScalarWideRow) { + // 10-column flat ROW covering every supported scalar leaf encoder. + auto type = ROW({ + BIGINT(), + INTEGER(), + SMALLINT(), + TINYINT(), + BOOLEAN(), + REAL(), + DOUBLE(), + VARCHAR(), + TIMESTAMP(), + BIGINT(), + }); + roundTrip(std::dynamic_pointer_cast(fuzzVector(type, 256, 17))); +} + +TEST_F(DenseRowTest, bigintEdges) { + auto bigint = makeFlatVector({ + std::numeric_limits::min(), + std::numeric_limits::max(), + -1, + 0, + 1, + }); + roundTrip(makeRowVector({bigint})); +} + +// HUGEINT (128-bit, used by DECIMAL(precision > 18, *)). Cover null, zero, +// small, negative, and INT128 edges. +TEST_F(DenseRowTest, hugeintEdges) { + using int128_t = __int128_t; + const int128_t kMax = (int128_t{1} << 126) + ((int128_t{1} << 126) - 1); + const int128_t kMin = -kMax - 1; + auto values = makeNullableFlatVector( + {kMin, + kMax, + int128_t{-1}, + int128_t{0}, + int128_t{1}, + std::nullopt, + int128_t{1234567890123456789LL}}); + roundTrip(makeRowVector({values})); +} + +// 16-column "Mix" schema from the production shuffle matrix tests: every +// supported scalar plus ARRAY/MAP/ROW. +TEST_F(DenseRowTest, mixWideRow) { + auto type = ROW({ + BOOLEAN(), + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + DECIMAL(10, 2), + DECIMAL(38, 18), + REAL(), + DOUBLE(), + VARCHAR(), + VARBINARY(), + DATE(), + TIMESTAMP(), + ARRAY(INTEGER()), + MAP(VARCHAR(), BIGINT()), + ROW({INTEGER(), VARCHAR()}), + }); + roundTrip(std::dynamic_pointer_cast(fuzzVector(type, 256, 41))); +} + +// A top-level ROW whose nested-ROW child is dictionary-wrapped +TEST_F(DenseRowTest, dictionaryWrappedNestedRow) { + auto innerInts = makeFlatVector({100, 200, 300, 400}); + auto innerStrs = makeFlatVector({"aaa", "bbb", "ccc", "ddd"}); + auto baseNestedRow = makeRowVector({innerInts, innerStrs}); + + const std::vector dictIndices = {3, 0, 2, 1, 0, 3}; + auto indicesBuf = + AlignedBuffer::allocate(dictIndices.size(), pool()); + std::memcpy( + indicesBuf->asMutable(), + dictIndices.data(), + dictIndices.size() * sizeof(vector_size_t)); + auto dictNestedRow = BaseVector::wrapInDictionary( + nullptr, + indicesBuf, + static_cast(dictIndices.size()), + baseNestedRow); + + auto bigintCol = makeFlatVector({10, 20, 30, 40, 50, 60}); + roundTrip(makeRowVector({bigintCol, dictNestedRow})); +} + +TEST_F(DenseRowTest, arrayOfBigint) { + roundTrip(std::dynamic_pointer_cast( + fuzzVector(ROW({ARRAY(BIGINT())}), 128, 12))); +} + +TEST_F(DenseRowTest, arrayOfArrayOfBigint) { + roundTrip(std::dynamic_pointer_cast( + fuzzVector(ROW({ARRAY(ARRAY(BIGINT()))}), 256, 13))); +} + +TEST_F(DenseRowTest, mapBigintReal) { + roundTrip(std::dynamic_pointer_cast( + fuzzVector(ROW({MAP(BIGINT(), REAL())}), 128, 14))); +} + +TEST_F(DenseRowTest, nestedRowOfMixedFields) { + auto type = ROW({ + BIGINT(), + ARRAY(VARCHAR()), + MAP(INTEGER(), ARRAY(BIGINT())), + ROW({INTEGER(), VARCHAR()}), + }); + roundTrip(std::dynamic_pointer_cast(fuzzVector(type, 128, 15))); +} + +TEST_F(DenseRowTest, emptyContainers) { + auto input = makeRowVector({ + makeArrayVector({{}, {}, {}}), + makeMapVector({{}, {}, {}}), + makeNestedArrayVectorFromJson({"[]", "[[]]", "[]"}), + }); + roundTrip(input); +} + +// Golden bytes pin the (marker-less) level-hoisted wire for +// ARRAY>. Row 0: [[1,2,3],[4,5,6]]; row 1: [[7],[8,9]]. +TEST_F(DenseRowTest, goldenBytesNestedArrays) { + auto input = makeRowVector({ + makeNestedArrayVectorFromJson( + {"[[1,2,3],[4,5,6]]", "[[7],[8,9]]"}), + }); + auto bytes = serializeToBytes(input); + // Row 0: 03 (outer=2+1) | 04 04 (inner=3+1,3+1) | 02 04 06 08 0a 0c (zz 1..6) + EXPECT_EQ(bytes.toHex(0), "030404020406080a0c"); + // Row 1: 03 (outer) | 02 03 (inner=1+1,2+1) | 0e 10 12 (zz 7,8,9) + EXPECT_EQ(bytes.toHex(1), "0302030e1012"); +} + +// Golden bytes for MAP with hoisted key/value segments. +// Row 0: {1 -> 1.5, 2 -> 2.5}. +TEST_F(DenseRowTest, goldenBytesMapHoistedKV) { + auto input = makeRowVector({ + makeMapVector({{{1, 1.5f}, {2, 2.5f}}}), + }); + auto bytes = serializeToBytes(input); + // 03 (card=2+1) | 02 04 (keys zz 1,2) | 0000c03f 00002040 (1.5f, 2.5f LE) + EXPECT_EQ( + bytes.toHex(0), + "030204" + "0000c03f" + "00002040"); +} + +// Golden bytes for the top-level all-scalar ROW shape (the slot-free fast +// path). Per-row layout (marker-less): +// [bigint][int][varchar_len+1|payload][real]. +TEST_F(DenseRowTest, goldenBytesScalarRow) { + auto type = ROW({BIGINT(), INTEGER(), VARCHAR(), REAL()}); + auto bigint = makeFlatVector({1, -1}); + auto integer = makeNullableFlatVector({2, std::nullopt}); + auto varchar = makeFlatVector({"ab", ""}); + auto real = makeNullableFlatVector({1.5f, std::nullopt}); + auto input = makeRowVector({bigint, integer, varchar, real}); + + auto bytes = serializeToBytes(input); + // Row 0: bigint zz(1)=02, int zz(2)=04, varchar(len=2,"ab")=03 6162, + // real 1.5f bits 0x3fc00000 LE = 0000c03f. + EXPECT_EQ( + bytes.toHex(0), + "0204" + "036162" + "0000c03f"); + // Row 1: bigint zz(adjust(-1))=zz(-2)=3 -> 03, int null=00, + // varchar(len=0)=01, real null = kNullFloatBits LE = 0000c07f. + EXPECT_EQ( + bytes.toHex(1), + "03" + "00" + "01" + "0000c07f"); + + // Round-trip restores the original. + std::vector rows(2); + for (vector_size_t r = 0; r < 2; ++r) { + rows[r] = bytes.toView(r); + } + assertEqualVectors(input, DenseRow::deserialize(rows, type, pool())); +} + +// Drive serialize() with reverse-order, gapped destination offsets to confirm +// each row's bytes land exactly where the offset table says and nowhere else. +TEST_F(DenseRowTest, serializeAtNonContiguousOffsets) { + auto type = ROW({BIGINT(), VARCHAR(), ARRAY(INTEGER())}); + VectorFuzzer::Options opts; + opts.vectorSize = 8; + opts.nullRatio = 0.0; + opts.stringLength = 12; + opts.containerLength = 4; + VectorFuzzer fuzzer(opts, pool(), 41); + auto input = std::dynamic_pointer_cast( + fuzzer.fuzzFlat(type, opts.vectorSize)); + const auto rowType = std::dynamic_pointer_cast(input->type()); + const auto n = input->size(); + + DenseRow rows(input); + const auto& sizes = rows.rowSizes(); + + // Reverse buffer order, 7-byte gaps pre-filled with 0xCC. + constexpr size_t kGap = 7; + std::vector offsets(n); + size_t cum = 0; + for (vector_size_t r = 0; r < n; ++r) { + const auto srcRow = static_cast(n - 1 - r); + offsets[srcRow] = cum; + cum += sizes[srcRow] + kGap; + } + std::vector buffer(cum, /*fill=*/0xCC); + + rows.serialize( + buffer.data(), + folly::Range(offsets.data(), offsets.size())); + + std::vector claimed(cum, false); + for (vector_size_t r = 0; r < n; ++r) { + for (uint32_t i = offsets[r]; i < offsets[r] + sizes[r]; ++i) { + claimed[i] = true; + } + } + for (uint32_t i = 0; i < cum; ++i) { + if (!claimed[i]) { + EXPECT_EQ(buffer[i], 0xCC) << "gap byte at " << i << " was overwritten"; + } + } + + std::vector data(n); + for (vector_size_t r = 0; r < n; ++r) { + data[r] = std::string_view( + reinterpret_cast(buffer.data() + offsets[r]), sizes[r]); + } + assertEqualVectors(input, DenseRow::deserialize(data, rowType, pool())); +} + +} // namespace +} // namespace bytedance::bolt::row diff --git a/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp b/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp index 92f7fd4c6..873559ed2 100644 --- a/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp +++ b/bolt/shuffle/sparksql/BoltRowBasedSortShuffleWriter.cpp @@ -129,7 +129,10 @@ arrow::Status BoltRowBasedSortShuffleWriter::split( pidArr, rv->size(), row2Partition_, partition2RowCount_)); strippedRv = getStrippedRowVectorWrapper(*rv); } - auto rowVectorWithStats = rowConverter_->getWithStats(strippedRv); + auto rowVectorWithStats = [&]() { + bytedance::bolt::NanosecondTimer timer(&convertTime_); + return rowConverter_->getWithStats(strippedRv); + }(); if (!boltPool_->maybeReserve(rowVectorWithStats.getTotalMemorySize())) { if (boltPool_->reservedBytes() >= kMinMemLimit) { RETURN_NOT_OK(tryEvict()); @@ -160,8 +163,8 @@ arrow::Status BoltRowBasedSortShuffleWriter::initFromRowVector( const bytedance::bolt::RowVector& rv) { // rv is not stripped auto&& rowType = getStrippedRowVectorType(rv); - rowConverter_ = - std::make_unique(rowType, boltPool_); + rowConverter_ = std::make_unique( + rowType, boltPool_, options_.rowFormat); sortedRows_.resize(numPartitions_); partitionBytes_.resize(numPartitions_, 0); return arrow::Status::OK(); diff --git a/bolt/shuffle/sparksql/BoltShuffleReader.cpp b/bolt/shuffle/sparksql/BoltShuffleReader.cpp index 6a5c6d045..760ff89de 100644 --- a/bolt/shuffle/sparksql/BoltShuffleReader.cpp +++ b/bolt/shuffle/sparksql/BoltShuffleReader.cpp @@ -763,8 +763,8 @@ BoltColumnarBatchDeserializerFactory::createDeserializer( zstdCodec_ = std::make_shared( 1 /*not used*/, false, memoryPool_, checksumEnabled_); rowBufferPool_ = std::make_shared(memoryPool_); - row2ColConverter_ = - std::make_shared(rowType_, boltPool_); + row2ColConverter_ = std::make_shared( + rowType_, boltPool_, rowFormat_); } return std::make_unique( std::move(in), @@ -867,6 +867,7 @@ BoltShuffleReader::BoltShuffleReader( factory_->setNumPartitions(options.numPartitions); factory_->setShuffleWriterType(options.forceShuffleWriterType); factory_->setpartitioningShortName(options.partitionShortName); + factory_->setRowFormat(options.rowFormat); } } // namespace bytedance::bolt::shuffle::sparksql diff --git a/bolt/shuffle/sparksql/BoltShuffleReader.h b/bolt/shuffle/sparksql/BoltShuffleReader.h index 624e18c07..85207a1a2 100644 --- a/bolt/shuffle/sparksql/BoltShuffleReader.h +++ b/bolt/shuffle/sparksql/BoltShuffleReader.h @@ -217,6 +217,10 @@ class BoltColumnarBatchDeserializerFactory { partitioningShortName_ = name; } + void setRowFormat(bytedance::bolt::row::RowFormat rowFormat) { + rowFormat_ = rowFormat; + } + private: std::shared_ptr schema_; std::shared_ptr codec_; @@ -226,6 +230,8 @@ class BoltColumnarBatchDeserializerFactory { int32_t numPartitions_{0}; ShuffleWriterType shuffleWriterType_{ShuffleWriterType::V1}; std::string partitioningShortName_; + bytedance::bolt::row::RowFormat rowFormat_{ + bytedance::bolt::row::RowFormat::COMPACT}; arrow::MemoryPool* memoryPool_; bytedance::bolt::memory::MemoryPool* boltPool_; diff --git a/bolt/shuffle/sparksql/Options.h b/bolt/shuffle/sparksql/Options.h index bcc10dbb1..358a26b74 100644 --- a/bolt/shuffle/sparksql/Options.h +++ b/bolt/shuffle/sparksql/Options.h @@ -36,6 +36,7 @@ #include #include #include +#include "bolt/row/RowFormat.h" #include "bolt/shuffle/sparksql/compression/Codec.h" #include "bolt/shuffle/sparksql/partition_writer/rss/RssClient.h" #include "bolt/shuffle/sparksql/partitioner/Partitioning.h" @@ -102,6 +103,9 @@ struct ShuffleReaderOptions { std::string partitionShortName = ""; int32_t forceShuffleWriterType = -1; + // On-wire row format for the row-based shuffle. Must match the writer side. + row::RowFormat rowFormat = row::RowFormat::COMPACT; + // Enable checksum in codec for shuffle data corruption detection bool checksumEnabled = true; }; @@ -160,6 +164,7 @@ struct ShuffleWriterOptions { int32_t recommendedColumn2RowSize = 0; double shuffleCheckRatio = 0; int32_t shuffleCheckMaxColumns = kDefaultShuffleCheckMaxColumns; + row::RowFormat rowFormat = row::RowFormat::COMPACT; PartitionWriterOptions partitionWriterOptions{}; }; diff --git a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp index 801470419..86373d20c 100644 --- a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp +++ b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.cpp @@ -31,36 +31,46 @@ #include "bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h" #include +#include #include #include +#include +#include #include "bolt/row/CompactRow.h" +#include "bolt/row/dense/DenseRow.h" using namespace bytedance; namespace bytedance::bolt::shuffle::sparksql { void ShuffleColumnarToRowConverter::init( const bytedance::bolt::RowTypePtr& rowType) { - if (auto fixedRowSize = bolt::row::CompactRow::fixedRowSize(rowType)) { - fixedRowSize_ = fixedRowSize.value(); + if (rowFormat_ == row::RowFormat::COMPACT) { + if (auto fixedRowSize = bolt::row::CompactRow::fixedRowSize(rowType)) { + fixedRowSize_ = fixedRowSize.value(); + } } } - ShuffleColumnarToRowConverter::RowVectorWithStats ShuffleColumnarToRowConverter::getWithStats( const bytedance::bolt::RowVectorPtr& rowVector) { RowVectorWithStats stats; - stats.compactRow = std::make_shared(rowVector); stats.numRows = rowVector->size(); stats.totalMemorySize = 0; auto numRows = rowVector->size(); - if (fixedRowSize_) { - stats.totalMemorySize = fixedRowSize_ * numRows; - } else { - for (auto i = 0; i < numRows; ++i) { - stats.totalMemorySize += stats.compactRow->rowSize(i); + if (rowFormat_ == row::RowFormat::COMPACT) { + stats.compactRow = std::make_unique(rowVector); + if (fixedRowSize_) { + stats.totalMemorySize = fixedRowSize_ * numRows; + } else { + for (auto i = 0; i < numRows; ++i) { + stats.totalMemorySize += stats.compactRow->rowSize(i); + } } + } else { + stats.denseRow = std::make_unique(rowVector); + stats.totalMemorySize = static_cast(stats.denseRow->totalSize()); } - // layout : rowSize | unsafeRow + // layout : rowSize | rowData stats.totalMemorySize += numRows * kSizeOfRowHeader; return stats; } @@ -70,13 +80,33 @@ void ShuffleColumnarToRowConverter::convert( const std::vector& indexes, std::vector>& sortedRows, std::vector& partitionBytes) { - auto numRows = rowVector.numRows; + const auto numRows = rowVector.numRows; totalBufferSize_ += rowVector.totalMemorySize; boltBuffers_.emplace_back( RowInternalBuffer::allocate(rowVector.totalMemorySize, boltPool_)); bufferAddress_ = boltBuffers_.back()->mutable_data(); - memset(bufferAddress_, 0, sizeof(int8_t) * rowVector.totalMemorySize); averageRowSize_ = numRows ? (rowVector.totalMemorySize / numRows) : 0; + + if (rowFormat_ == row::RowFormat::DENSE) { + const std::vector& rowSizesVec = rowVector.denseRow->rowSizes(); + std::vector bodyOffsets(numRows); + uint32_t cursor = 0; + for (int64_t r = 0; r < numRows; ++r) { + const auto rowSize = static_cast(rowSizesVec[r]); + *reinterpret_cast(bufferAddress_ + cursor) = rowSize; + bodyOffsets[r] = cursor + kSizeOfRowHeader; + sortedRows[indexes[r]].push_back(bufferAddress_ + cursor); + partitionBytes[indexes[r]] += rowSize + kSizeOfRowHeader; + cursor += static_cast(rowSize) + kSizeOfRowHeader; + } + + rowVector.denseRow->serialize( + bufferAddress_, + folly::Range(bodyOffsets.data(), bodyOffsets.size())); + return; + } + + std::memset(bufferAddress_, 0, rowVector.totalMemorySize); size_t offset = kSizeOfRowHeader; for (auto i = 0; i < numRows; ++i) { auto rowSize = diff --git a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h index 323b2ba37..66191e582 100644 --- a/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h +++ b/bolt/shuffle/sparksql/ShuffleColumnarToRowConverter.h @@ -34,8 +34,14 @@ #include #include +#include +#include +#include + #include "bolt/buffer/Buffer.h" #include "bolt/row/CompactRow.h" +#include "bolt/row/RowFormat.h" +#include "bolt/row/dense/DenseRow.h" #include "bolt/vector/ComplexVector.h" namespace bytedance::bolt::shuffle::sparksql { static const uint32_t kSizeOfRowHeader = sizeof(int32_t); @@ -82,8 +88,10 @@ class ShuffleColumnarToRowConverter { public: explicit ShuffleColumnarToRowConverter( const bytedance::bolt::RowTypePtr& rowType, - bytedance::bolt::memory::MemoryPool* boltPool) - : boltPool_(boltPool) { + bytedance::bolt::memory::MemoryPool* boltPool, + bytedance::bolt::row::RowFormat rowFormat = + bytedance::bolt::row::RowFormat::COMPACT) + : boltPool_(boltPool), rowFormat_(rowFormat) { init(rowType); } @@ -96,9 +104,10 @@ class ShuffleColumnarToRowConverter { } private: - std::shared_ptr compactRow; - int64_t numRows; - int64_t totalMemorySize; + std::unique_ptr compactRow; + std::unique_ptr denseRow; + int64_t numRows{0}; + int64_t totalMemorySize{0}; }; RowVectorWithStats getWithStats( @@ -125,12 +134,12 @@ class ShuffleColumnarToRowConverter { private: void init(const bytedance::bolt::RowTypePtr& rowType); - int32_t fixedRowSize_ = 0; uint8_t* bufferAddress_; int64_t totalBufferSize_{0}; size_t averageRowSize_{0}; bytedance::bolt::memory::MemoryPool* boltPool_; + bytedance::bolt::row::RowFormat rowFormat_; std::vector boltBuffers_; }; diff --git a/bolt/shuffle/sparksql/ShuffleReaderNode.cpp b/bolt/shuffle/sparksql/ShuffleReaderNode.cpp index 8e8272cf6..c7f652ca6 100644 --- a/bolt/shuffle/sparksql/ShuffleReaderNode.cpp +++ b/bolt/shuffle/sparksql/ShuffleReaderNode.cpp @@ -46,7 +46,8 @@ SparkShuffleReader::SparkShuffleReader( rowBufferPool_(std::make_shared(arrowPool_.get())), row2ColConverter_(std::make_shared( outputType_, - pool())) { + pool(), + shuffleReaderOptions_.rowFormat)) { isValidityBuffer_.reserve(outputType_->size()); for (size_t i = 0; i < outputType_->size(); ++i) { switch (outputType_->childAt(i)->kind()) { diff --git a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp index 3979bbf26..e2d0c4000 100644 --- a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp +++ b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.cpp @@ -31,18 +31,22 @@ #include "bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h" #include "bolt/row/CompactRow.h" +#include "bolt/row/dense/DenseRow.h" #include "bolt/vector/arrow/Bridge.h" using namespace bytedance::bolt; namespace bytedance::bolt::shuffle::sparksql { ShuffleRowToColumnarConverter::ShuffleRowToColumnarConverter( const bytedance::bolt::RowTypePtr& rowType, - memory::MemoryPool* memoryPool) - : rowType_(rowType), pool_(memoryPool) {} + memory::MemoryPool* memoryPool, + bytedance::bolt::row::RowFormat rowFormat) + : rowType_(rowType), pool_(memoryPool), rowFormat_(rowFormat) {} RowVectorPtr ShuffleRowToColumnarConverter::convert( std::vector& rows) { - auto vp = row::CompactRow::deserialize(rows, rowType_, pool_); - return std::dynamic_pointer_cast(vp); + if (rowFormat_ == row::RowFormat::COMPACT) { + return row::CompactRow::deserialize(rows, rowType_, pool_); + } + return row::DenseRow::deserialize(rows, rowType_, pool_); } RowVectorPtr ShuffleRowToColumnarConverter::convertToComposite( diff --git a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h index 74f3e31f7..0eec9b791 100644 --- a/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h +++ b/bolt/shuffle/sparksql/ShuffleRowToColumnarConverter.h @@ -33,6 +33,7 @@ #include #include "bolt/common/memory/Memory.h" +#include "bolt/row/RowFormat.h" #include "bolt/type/Type.h" #include "bolt/vector/ComplexVector.h" namespace bytedance::bolt::shuffle::sparksql { @@ -41,7 +42,9 @@ class ShuffleRowToColumnarConverter { public: ShuffleRowToColumnarConverter( const bytedance::bolt::RowTypePtr& rowType, - bytedance::bolt::memory::MemoryPool* memoryPool); + bytedance::bolt::memory::MemoryPool* memoryPool, + bytedance::bolt::row::RowFormat rowFormat = + bytedance::bolt::row::RowFormat::COMPACT); bytedance::bolt::RowVectorPtr convert(std::vector& rows); @@ -52,6 +55,7 @@ class ShuffleRowToColumnarConverter { protected: bytedance::bolt::RowTypePtr rowType_; bytedance::bolt::memory::MemoryPool* pool_; + bytedance::bolt::row::RowFormat rowFormat_; }; } // namespace bytedance::bolt::shuffle::sparksql diff --git a/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp b/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp index 86415e7db..993552c46 100644 --- a/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp +++ b/bolt/shuffle/sparksql/tests/ShuffleMatrixTest.cpp @@ -42,7 +42,16 @@ std::vector buildShuffleParams() { dataTypeGroup, numPartitions, numMappers}; - if (param.isSupported()) { + if (!param.isSupported()) { + continue; + } + if (shuffleMode == 3) { + // RowBased: round-trip both on-wire row formats. + param.rowFormat = bytedance::bolt::row::RowFormat::DENSE; + params.push_back(param); + param.rowFormat = bytedance::bolt::row::RowFormat::COMPACT; + params.push_back(param); + } else { params.push_back(param); } } diff --git a/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp b/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp index badaa582d..5eb1a8a4d 100644 --- a/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp +++ b/bolt/shuffle/sparksql/tests/ShuffleTestBase.cpp @@ -161,14 +161,16 @@ std::string ShuffleTestParam::toString() const { auto memStr = fmt::format("{}{}", v, units[u]); return fmt::format( - "{}_{}_{}_{}_M{}_P{}_{}", + "{}_{}_{}_{}_M{}_P{}_{}_{}", partitioning, shuffleModeToString(shuffleMode), writerTypeToString(writerType), dataTypeGroupToString(dataTypeGroup), numMappers, numPartitions, - memStr); + memStr, + rowFormat == bytedance::bolt::row::RowFormat::COMPACT ? "Compact" + : "Dense"); } bool ShuffleTestParam::isSupported() const { @@ -513,6 +515,7 @@ ShuffleRunResult ShuffleTestBase::runShuffle( writerOptions.partitioning = toPartitioning(param.partitioning); writerOptions.partitionWriterOptions.numPartitions = param.numPartitions; writerOptions.forceShuffleWriterType = param.shuffleMode; + writerOptions.rowFormat = param.rowFormat; writerOptions.partitionWriterOptions.partitionWriterType = param.writerType; writerOptions.taskAttemptId = memoryManagerHolder->taskAttemptId(); writerOptions.partitionWriterOptions.shuffleBufferSize = @@ -640,6 +643,7 @@ ShuffleRunResult ShuffleTestBase::runShuffle( ShuffleReaderOptions readerOptions; readerOptions.numPartitions = param.numPartitions; readerOptions.forceShuffleWriterType = param.shuffleMode; + readerOptions.rowFormat = param.rowFormat; readerOptions.partitionShortName = param.partitioning; readerOptions.shuffleBatchByteSize = 1024 * 1024; // 1MB diff --git a/bolt/shuffle/sparksql/tests/ShuffleTestBase.h b/bolt/shuffle/sparksql/tests/ShuffleTestBase.h index 204b265ff..f0552a190 100644 --- a/bolt/shuffle/sparksql/tests/ShuffleTestBase.h +++ b/bolt/shuffle/sparksql/tests/ShuffleTestBase.h @@ -67,6 +67,9 @@ struct ShuffleTestParam { int32_t numBatches = 4; int32_t shuffleBufferSize = kDefaultShuffleWriterBufferSize; bool verifyOutput = true; + // On-wire row format for the RowBased writer (shuffleMode == 3); ignored by + // the other modes. Threaded into both writer and reader options. + row::RowFormat rowFormat = bytedance::bolt::row::RowFormat::DENSE; std::string toString() const;