diff --git a/dbms/src/DataStreams/ScanHashMapAfterProbeBlockInputStream.cpp b/dbms/src/DataStreams/ScanHashMapAfterProbeBlockInputStream.cpp index b4523c913ef..884c43a53f8 100644 --- a/dbms/src/DataStreams/ScanHashMapAfterProbeBlockInputStream.cpp +++ b/dbms/src/DataStreams/ScanHashMapAfterProbeBlockInputStream.cpp @@ -258,6 +258,12 @@ Block ScanHashMapAfterProbeBlockInputStream::readImpl() else fillColumnsUsingCurrentPartition(columns_left, columns_right, row_counter_column); break; + case ASTTableJoin::Kind::Full: + if (parent.has_other_condition) + fillColumnsUsingCurrentPartition(columns_left, columns_right, row_counter_column); + else + fillColumnsUsingCurrentPartition(columns_left, columns_right, row_counter_column); + break; case ASTTableJoin::Kind::RightAnti: case ASTTableJoin::Kind::RightOuter: if (parent.has_other_condition) diff --git a/dbms/src/Debug/MockExecutor/JoinBinder.cpp b/dbms/src/Debug/MockExecutor/JoinBinder.cpp index 01011dd4e66..3f010b48975 100644 --- a/dbms/src/Debug/MockExecutor/JoinBinder.cpp +++ b/dbms/src/Debug/MockExecutor/JoinBinder.cpp @@ -19,11 +19,63 @@ #include #include #include +#include #include #include namespace DB::mock { +namespace +{ +void appendJoinSchema(DAGSchema & output_schema, const DAGSchema & input_schema, bool make_nullable) +{ + for (const auto & field : input_schema) + { + if (make_nullable && field.second.hasNotNullFlag()) + output_schema.push_back(toNullableDAGColumnInfo(field)); + else + output_schema.push_back(field); + } +} + +void buildLeftSideJoinSchema(DAGSchema & schema, const DAGSchema & left_schema, tipb::JoinType tp) +{ + appendJoinSchema(schema, left_schema, JoinInterpreterHelper::makeLeftJoinSideNullable(tp)); +} + +void buildRightSideJoinSchema(DAGSchema & schema, const DAGSchema & right_schema, tipb::JoinType tp) +{ + /// Note: for semi join, the right table column is ignored + /// but for (anti) left outer semi join, a 1/0 (uint8) field is pushed back + /// indicating whether right table has matching row(s), see comment in ASTTableJoin::Kind for details. + if (tp == tipb::JoinType::TypeLeftOuterSemiJoin || tp == tipb::JoinType::TypeAntiLeftOuterSemiJoin) + { + tipb::FieldType field_type{}; + field_type.set_tp(TiDB::TypeTiny); + field_type.set_charset("binary"); + field_type.set_collate(TiDB::ITiDBCollator::BINARY); + field_type.set_flag(0); + field_type.set_flen(-1); + field_type.set_decimal(-1); + schema.push_back(std::make_pair("", TiDB::fieldTypeToColumnInfo(field_type))); + } + else if (tp != tipb::JoinType::TypeSemiJoin && tp != tipb::JoinType::TypeAntiSemiJoin) + { + appendJoinSchema(schema, right_schema, JoinInterpreterHelper::makeRightJoinSideNullable(tp)); + } +} + +DAGSchema buildOtherConditionSchema( + const DAGSchema & left_schema, + const DAGSchema & right_schema, + tipb::JoinType join_type) +{ + DAGSchema merged_children_schema; + appendJoinSchema(merged_children_schema, left_schema, JoinInterpreterHelper::makeLeftJoinSideNullable(join_type)); + appendJoinSchema(merged_children_schema, right_schema, JoinInterpreterHelper::makeRightJoinSideNullable(join_type)); + return merged_children_schema; +} +} // namespace void JoinBinder::addRuntimeFilter(MockRuntimeFilter & rf) { @@ -95,22 +147,8 @@ void JoinBinder::columnPrune(std::unordered_set & used_columns) /// update output schema output_schema.clear(); - - for (auto & field : children[0]->output_schema) - { - if (tp == tipb::TypeRightOuterJoin && field.second.hasNotNullFlag()) - output_schema.push_back(toNullableDAGColumnInfo(field)); - else - output_schema.push_back(field); - } - - for (auto & field : children[1]->output_schema) - { - if (tp == tipb::TypeLeftOuterJoin && field.second.hasNotNullFlag()) - output_schema.push_back(toNullableDAGColumnInfo(field)); - else - output_schema.push_back(field); - } + buildLeftSideJoinSchema(output_schema, children[0]->output_schema, tp); + buildRightSideJoinSchema(output_schema, children[1]->output_schema, tp); } void JoinBinder::fillJoinKeyAndFieldType( @@ -187,11 +225,8 @@ bool JoinBinder::toTiPBExecutor( astToPB(children[1]->output_schema, expr, cond, collator_id, context); } - DAGSchema merged_children_schema{children[0]->output_schema}; - merged_children_schema.insert( - merged_children_schema.end(), - children[1]->output_schema.begin(), - children[1]->output_schema.end()); + DAGSchema merged_children_schema + = buildOtherConditionSchema(children[0]->output_schema, children[1]->output_schema, tp); for (const auto & expr : other_conds) { @@ -293,45 +328,6 @@ void JoinBinder::toMPPSubPlan( exchange_map[right_exchange_receiver->name] = std::make_pair(right_exchange_receiver, right_exchange_sender); } -static void buildLeftSideJoinSchema(DAGSchema & schema, const DAGSchema & left_schema, tipb::JoinType tp) -{ - for (const auto & field : left_schema) - { - if (tp == tipb::JoinType::TypeRightOuterJoin && field.second.hasNotNullFlag()) - schema.push_back(toNullableDAGColumnInfo(field)); - else - schema.push_back(field); - } -} - -static void buildRightSideJoinSchema(DAGSchema & schema, const DAGSchema & right_schema, tipb::JoinType tp) -{ - /// Note: for semi join, the right table column is ignored - /// but for (anti) left outer semi join, a 1/0 (uint8) field is pushed back - /// indicating whether right table has matching row(s), see comment in ASTTableJoin::Kind for details. - if (tp == tipb::JoinType::TypeLeftOuterSemiJoin || tp == tipb::JoinType::TypeAntiLeftOuterSemiJoin) - { - tipb::FieldType field_type{}; - field_type.set_tp(TiDB::TypeTiny); - field_type.set_charset("binary"); - field_type.set_collate(TiDB::ITiDBCollator::BINARY); - field_type.set_flag(0); - field_type.set_flen(-1); - field_type.set_decimal(-1); - schema.push_back(std::make_pair("", TiDB::fieldTypeToColumnInfo(field_type))); - } - else if (tp != tipb::JoinType::TypeSemiJoin && tp != tipb::JoinType::TypeAntiSemiJoin) - { - for (const auto & field : right_schema) - { - if (tp == tipb::JoinType::TypeLeftOuterJoin && field.second.hasNotNullFlag()) - schema.push_back(toNullableDAGColumnInfo(field)); - else - schema.push_back(field); - } - } -} - // compileJoin constructs a mocked Join executor node, note that all conditional expression params can be default ExecutorBinderPtr compileJoin( size_t & executor_index, diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index fc8671c5aa8..c9057de9640 100644 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -881,6 +881,8 @@ String getJoinTypeName(const tipb::JoinType & tp) return "LeftOuterJoin"; case tipb::JoinType::TypeRightOuterJoin: return "RightOuterJoin"; + case tipb::JoinType::TypeFullOuterJoin: + return "FullOuterJoin"; case tipb::JoinType::TypeLeftOuterSemiJoin: return "LeftOuterSemiJoin"; case tipb::JoinType::TypeAntiSemiJoin: diff --git a/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp b/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp index 10f00045eac..a4e5753dadd 100644 --- a/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp +++ b/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp @@ -60,6 +60,7 @@ std::pair getJoinKindAndBuildSideIndex( /// 3. for non-cross left/right outer join, there is no problem in this swap. /// 4. for cross left outer join, the build side is always right, needn't and can't swap. /// 5. for cross right outer join, the build side is always left, so it will always swap and change to cross left outer join. + /// 6. for non-cross full outer join, keep full join kind and respect inner_idx as build side. /// note that whatever the build side is, we can't support cross-right-outer join now. static const std::unordered_map< std::pair, @@ -72,6 +73,8 @@ std::pair getJoinKindAndBuildSideIndex( {{tipb::JoinType::TypeLeftOuterJoin, 1}, {ASTTableJoin::Kind::LeftOuter, 1}}, {{tipb::JoinType::TypeRightOuterJoin, 0}, {ASTTableJoin::Kind::LeftOuter, 0}}, {{tipb::JoinType::TypeRightOuterJoin, 1}, {ASTTableJoin::Kind::RightOuter, 1}}, + {{tipb::JoinType::TypeFullOuterJoin, 0}, {ASTTableJoin::Kind::Full, 0}}, + {{tipb::JoinType::TypeFullOuterJoin, 1}, {ASTTableJoin::Kind::Full, 1}}, {{tipb::JoinType::TypeSemiJoin, 0}, {ASTTableJoin::Kind::RightSemi, 0}}, {{tipb::JoinType::TypeSemiJoin, 1}, {ASTTableJoin::Kind::Semi, 1}}, {{tipb::JoinType::TypeAntiSemiJoin, 0}, {ASTTableJoin::Kind::RightAnti, 0}}, @@ -103,6 +106,8 @@ std::pair getJoinKindAndBuildSideIndex( {{tipb::JoinType::TypeAntiLeftOuterSemiJoin, 1}, {ASTTableJoin::Kind::NullAware_LeftOuterAnti, 1}}}; RUNTIME_ASSERT(inner_index == 0 || inner_index == 1); + if (unlikely(tipb_join_type == tipb::JoinType::TypeFullOuterJoin && join_keys_size == 0)) + throw TiFlashException("Cartesian full outer join is not supported yet", Errors::Coprocessor::BadRequest); const auto & join_type_map = [is_null_aware, join_keys_size]() { if (is_null_aware) { @@ -295,8 +300,8 @@ NamesAndTypes TiFlashJoin::genColumnsForOtherJoinFilter( column_set_for_origin_columns.emplace(p.name); } }; - append_origin_columns(left_cols, join.join_type() == tipb::JoinType::TypeRightOuterJoin); - append_origin_columns(right_cols, join.join_type() == tipb::JoinType::TypeLeftOuterJoin); + append_origin_columns(left_cols, makeLeftJoinSideNullable(join.join_type())); + append_origin_columns(right_cols, makeRightJoinSideNullable(join.join_type())); /// append the columns generated by probe side prepare join actions. /// the new columns are @@ -310,8 +315,8 @@ NamesAndTypes TiFlashJoin::genColumnsForOtherJoinFilter( columns_for_other_join_filter.emplace_back(p.name, make_nullable ? makeNullable(p.type) : p.type); } }; - bool make_nullable = build_side_index == 1 ? join.join_type() == tipb::JoinType::TypeRightOuterJoin - : join.join_type() == tipb::JoinType::TypeLeftOuterJoin; + bool make_nullable = build_side_index == 1 ? makeLeftJoinSideNullable(join.join_type()) + : makeRightJoinSideNullable(join.join_type()); append_new_columns(probe_prepare_join_actions->getSampleBlock(), make_nullable); return columns_for_other_join_filter; @@ -330,11 +335,11 @@ NamesAndTypes TiFlashJoin::genJoinOutputColumns( } }; - append_output_columns(left_cols, join.join_type() == tipb::JoinType::TypeRightOuterJoin); + append_output_columns(left_cols, makeLeftJoinSideNullable(join.join_type())); if (!isSemiFamily() && !isLeftOuterSemiFamily()) { /// for (left outer) semi join, the columns from right table will be ignored - append_output_columns(right_cols, join.join_type() == tipb::JoinType::TypeLeftOuterJoin); + append_output_columns(right_cols, makeRightJoinSideNullable(join.join_type())); } if (!match_helper_name.empty()) diff --git a/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h b/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h index 4745c6e34ce..6a6483040bd 100644 --- a/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h +++ b/dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h @@ -94,9 +94,9 @@ struct JoinNonEqualConditions /// Validate this JoinNonEqualConditions and return error message if any. const char * validate(ASTTableJoin::Kind kind) const { - if unlikely (!left_filter_column.empty() && !isLeftOuterJoin(kind)) + if unlikely (!left_filter_column.empty() && !(isLeftOuterJoin(kind) || kind == ASTTableJoin::Kind::Full)) return "non left join with left conditions"; - if unlikely (!right_filter_column.empty() && !isRightOuterJoin(kind)) + if unlikely (!right_filter_column.empty() && !(isRightOuterJoin(kind) || kind == ASTTableJoin::Kind::Full)) return "non right join with right conditions"; if unlikely ((!other_cond_name.empty() || !other_eq_cond_from_in_name.empty()) && other_cond_expr == nullptr) @@ -119,6 +119,16 @@ struct JoinNonEqualConditions namespace JoinInterpreterHelper { +constexpr bool makeLeftJoinSideNullable(tipb::JoinType join_type) +{ + return join_type == tipb::JoinType::TypeRightOuterJoin || join_type == tipb::JoinType::TypeFullOuterJoin; +} + +constexpr bool makeRightJoinSideNullable(tipb::JoinType join_type) +{ + return join_type == tipb::JoinType::TypeLeftOuterJoin || join_type == tipb::JoinType::TypeFullOuterJoin; +} + struct TiFlashJoin { TiFlashJoin(const tipb::Join & join_, bool is_test); diff --git a/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp b/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp index 6c9ec8d1ed0..ddfbd1f89aa 100644 --- a/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp +++ b/dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -172,9 +173,9 @@ bool collectForJoin(std::vector & output_field_types, const tip // collect output_field_types for join self for (auto & field_type : children_output_field_types[0]) { - if (executor.join().join_type() == tipb::JoinType::TypeRightOuterJoin) + if (JoinInterpreterHelper::makeLeftJoinSideNullable(executor.join().join_type())) { - /// the type of left column for right join is always nullable + /// the type of left column for right/full join is always nullable auto updated_field_type = field_type; updated_field_type.set_flag( static_cast(updated_field_type.flag()) & (~static_cast(TiDB::ColumnFlagNotNull))); @@ -210,9 +211,9 @@ bool collectForJoin(std::vector & output_field_types, const tip /// for semi/anti semi join, the right table column is ignored for (auto & field_type : children_output_field_types[1]) { - if (executor.join().join_type() == tipb::JoinType::TypeLeftOuterJoin) + if (JoinInterpreterHelper::makeRightJoinSideNullable(executor.join().join_type())) { - /// the type of right column for left join is always nullable + /// the type of right column for left/full join is always nullable auto updated_field_type = field_type; updated_field_type.set_flag( updated_field_type.flag() & (~static_cast(TiDB::ColumnFlagNotNull))); diff --git a/dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp b/dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp index 4cfa8db2369..62ce76f3e09 100644 --- a/dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp +++ b/dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -26,6 +27,26 @@ class JoinKindAndBuildIndexTestRunner : public testing::Test { }; +namespace +{ +tipb::Expr makeJoinKeyWithFieldType() +{ + tipb::Expr expr; + expr.mutable_field_type()->set_tp(TiDB::TypeLong); + return expr; +} + +tipb::Join makeFullOuterJoinForSchemaTest(size_t inner_index) +{ + tipb::Join join; + join.set_join_type(tipb::JoinType::TypeFullOuterJoin); + join.set_inner_idx(inner_index); + *join.add_left_join_keys() = makeJoinKeyWithFieldType(); + *join.add_right_join_keys() = makeJoinKeyWithFieldType(); + return join; +} +} // namespace + bool invalidParams(tipb::JoinType tipb_join_type, size_t inner_index, bool is_null_aware, size_t join_keys_size) { try @@ -39,6 +60,19 @@ bool invalidParams(tipb::JoinType tipb_join_type, size_t inner_index, bool is_nu } } +String getErrorMessage(tipb::JoinType tipb_join_type, size_t inner_index, bool is_null_aware, size_t join_keys_size) +{ + try + { + JoinInterpreterHelper::getJoinKindAndBuildSideIndex(tipb_join_type, inner_index, is_null_aware, join_keys_size); + return ""; + } + catch (Exception & e) + { + return e.message(); + } +} + TEST(JoinKindAndBuildIndexTestRunner, TestNullAwareJoins) { auto result = JoinInterpreterHelper::getJoinKindAndBuildSideIndex(tipb::JoinType::TypeAntiSemiJoin, 1, true, 1); @@ -97,6 +131,11 @@ TEST(JoinKindAndBuildIndexTestRunner, TestCrossJoins) ASSERT_TRUE(invalidParams(tipb::JoinType::TypeLeftOuterSemiJoin, 0, false, 0)); ASSERT_TRUE(invalidParams(tipb::JoinType::TypeAntiLeftOuterSemiJoin, 0, false, 0)); + + /// Cross FullOuterJoin is out of scope in this round and should fail with a clear message. + auto error_message = getErrorMessage(tipb::JoinType::TypeFullOuterJoin, 0, false, 0); + ASSERT_FALSE(error_message.empty()); + ASSERT_NE(error_message.find("Cartesian full outer join"), String::npos); } TEST(JoinKindAndBuildIndexTestRunner, TestEqualJoins) @@ -119,6 +158,12 @@ TEST(JoinKindAndBuildIndexTestRunner, TestEqualJoins) result = JoinInterpreterHelper::getJoinKindAndBuildSideIndex(tipb::JoinType::TypeRightOuterJoin, 1, false, 1); ASSERT_TRUE(result.first == ASTTableJoin::Kind::RightOuter && result.second == 1); + /// FullOuterJoin, keep full join kind and respect inner_idx as build side. + result = JoinInterpreterHelper::getJoinKindAndBuildSideIndex(tipb::JoinType::TypeFullOuterJoin, 0, false, 1); + ASSERT_TRUE(result.first == ASTTableJoin::Kind::Full && result.second == 0); + result = JoinInterpreterHelper::getJoinKindAndBuildSideIndex(tipb::JoinType::TypeFullOuterJoin, 1, false, 1); + ASSERT_TRUE(result.first == ASTTableJoin::Kind::Full && result.second == 1); + /// Semi/Anti result = JoinInterpreterHelper::getJoinKindAndBuildSideIndex(tipb::JoinType::TypeSemiJoin, 1, false, 1); ASSERT_TRUE(result.first == ASTTableJoin::Kind::Semi && result.second == 1); @@ -140,5 +185,63 @@ TEST(JoinKindAndBuildIndexTestRunner, TestEqualJoins) ASSERT_TRUE(invalidParams(tipb::JoinType::TypeAntiLeftOuterSemiJoin, 0, false, 1)); } +TEST(JoinKindAndBuildIndexTestRunner, TestFullJoinOutputColumnsAreNullable) +{ + auto join = makeFullOuterJoinForSchemaTest(1); + JoinInterpreterHelper::TiFlashJoin tiflash_join(join, false); + + auto int_type = std::make_shared(); + NamesAndTypes left_cols{{"l.a", int_type}, {"l.b", int_type}}; + NamesAndTypes right_cols{{"r.a", int_type}, {"r.b", int_type}}; + + auto join_output_columns = tiflash_join.genJoinOutputColumns(left_cols, right_cols, ""); + ASSERT_EQ(join_output_columns.size(), 4); + for (const auto & column : join_output_columns) + ASSERT_TRUE(column.type->isNullable()) << column.name; +} + +TEST(JoinKindAndBuildIndexTestRunner, TestFullJoinOtherConditionColumnsAreNullable) +{ + auto int_type = std::make_shared(); + NamesAndTypes left_cols{{"l.a", int_type}, {"l.b", int_type}}; + NamesAndTypes right_cols{{"r.a", int_type}, {"r.b", int_type}}; + + for (size_t inner_index : {size_t{0}, size_t{1}}) + { + auto join = makeFullOuterJoinForSchemaTest(inner_index); + JoinInterpreterHelper::TiFlashJoin tiflash_join(join, false); + + NamesAndTypes probe_prepare_columns = inner_index == 1 + ? NamesAndTypes{{"l.a", int_type}, {"l.b", int_type}, {"probe_extra", int_type}} + : NamesAndTypes{{"r.a", int_type}, {"r.b", int_type}, {"probe_extra", int_type}}; + auto probe_prepare_join_actions = std::make_shared(probe_prepare_columns); + + auto columns_for_other_join_filter + = tiflash_join.genColumnsForOtherJoinFilter(left_cols, right_cols, probe_prepare_join_actions); + ASSERT_EQ(columns_for_other_join_filter.size(), 5); + ASSERT_EQ(columns_for_other_join_filter.back().name, "probe_extra"); + for (const auto & column : columns_for_other_join_filter) + ASSERT_TRUE(column.type->isNullable()) << column.name; + } +} + +TEST(JoinKindAndBuildIndexTestRunner, TestFullJoinAllowsLeftAndRightConditions) +{ + JoinNonEqualConditions full_conditions; + full_conditions.left_filter_column = "left_cond"; + full_conditions.right_filter_column = "right_cond"; + ASSERT_EQ(full_conditions.validate(ASTTableJoin::Kind::Full), nullptr); + + JoinNonEqualConditions left_only_conditions; + left_only_conditions.left_filter_column = "left_cond"; + ASSERT_EQ(left_only_conditions.validate(ASTTableJoin::Kind::LeftOuter), nullptr); + ASSERT_STREQ(left_only_conditions.validate(ASTTableJoin::Kind::Inner), "non left join with left conditions"); + + JoinNonEqualConditions right_only_conditions; + right_only_conditions.right_filter_column = "right_cond"; + ASSERT_EQ(right_only_conditions.validate(ASTTableJoin::Kind::RightOuter), nullptr); + ASSERT_STREQ(right_only_conditions.validate(ASTTableJoin::Kind::Inner), "non right join with right conditions"); +} + } // namespace tests } // namespace DB diff --git a/dbms/src/Flash/tests/gtest_join_executor.cpp b/dbms/src/Flash/tests/gtest_join_executor.cpp index 3f5ddcc5143..fe65fdeadd4 100644 --- a/dbms/src/Flash/tests/gtest_join_executor.cpp +++ b/dbms/src/Flash/tests/gtest_join_executor.cpp @@ -5130,6 +5130,210 @@ try } CATCH +TEST_F(JoinExecutorTestRunner, FullOuterJoinWithOtherCondition) +try +{ + using tipb::JoinType; + + const std::vector> + test_cases + = {{ + {toNullableVec("a", {1}), toNullableVec("c", {10})}, + {toNullableVec("a", {1}), toNullableVec("c", {5})}, + {toNullableVec("a", {1, {}}), + toNullableVec("c", {10, {}}), + toNullableVec("a", {{}, 1}), + toNullableVec("c", {{}, 5})}, + "key matched but other condition failed for all rows", + }, + { + {toNullableVec("a", {2}), toNullableVec("c", {20})}, + {toNullableVec("a", {1}), toNullableVec("c", {5})}, + {toNullableVec("a", {2, {}}), + toNullableVec("c", {20, {}}), + toNullableVec("a", {{}, 1}), + toNullableVec("c", {{}, 5})}, + "key not matched", + }, + { + {toNullableVec("a", {{}}), toNullableVec("c", {10})}, + {toNullableVec("a", {1}), toNullableVec("c", {5})}, + {toNullableVec("a", {{}, {}}), + toNullableVec("c", {10, {}}), + toNullableVec("a", {{}, 1}), + toNullableVec("c", {{}, 5})}, + "probe-side null key should still be emitted as unmatched row", + }, + { + {toNullableVec("a", {1}), toNullableVec("c", {3})}, + {toNullableVec("a", {1, 1}), toNullableVec("c", {2, 4})}, + {toNullableVec("a", {1, {}}), + toNullableVec("c", {3, {}}), + toNullableVec("a", {1, 1}), + toNullableVec("c", {4, 2})}, + "only rows that pass other condition should be marked used", + }}; + + for (const auto & [left, right, res, test_case_name] : test_cases) + { + SCOPED_TRACE(test_case_name); + context.addMockTable( + "full_outer_other_condition", + "t", + {{"a", TiDB::TP::TypeLong}, {"c", TiDB::TP::TypeLong}}, + left); + context.addMockTable( + "full_outer_other_condition", + "s", + {{"a", TiDB::TP::TypeLong}, {"c", TiDB::TP::TypeLong}}, + right); + + auto request = context.scan("full_outer_other_condition", "t") + .join( + context.scan("full_outer_other_condition", "s"), + JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {lt(col("t.c"), col("s.c"))}, + {}, + 0, + false, + 1) + .build(context); + executeAndAssertColumnsEqual(request, res); + + auto request_column_prune = context.scan("full_outer_other_condition", "t") + .join( + context.scan("full_outer_other_condition", "s"), + JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {lt(col("t.c"), col("s.c"))}, + {}, + 0, + false, + 1) + .aggregation({Count(lit(static_cast(1)))}, {}) + .build(context); + ASSERT_COLUMNS_EQ_UR(genScalarCountResults(res), executeStreams(request_column_prune, 2)); + } +} +CATCH + +TEST_F(JoinExecutorTestRunner, FullOuterJoinWithoutOtherConditionAndNullKey) +try +{ + using tipb::JoinType; + + context.addMockTable( + "full_outer_no_other_condition", + "t", + {{"a", TiDB::TP::TypeLong}, {"c", TiDB::TP::TypeLong}}, + {toNullableVec("a", {1, 2, {}, 4}), toNullableVec("c", {10, 20, 30, 40})}); + context.addMockTable( + "full_outer_no_other_condition", + "s", + {{"a", TiDB::TP::TypeLong}, {"c", TiDB::TP::TypeLong}}, + {toNullableVec("a", {1, 3, {}}), toNullableVec("c", {100, 300, 400})}); + + auto request = context.scan("full_outer_no_other_condition", "t") + .join( + context.scan("full_outer_no_other_condition", "s"), + JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {}, + {}, + 0, + false, + 1) + .build(context); + + const ColumnsWithTypeAndName expected = { + toNullableVec({1, 2, {}, 4, {}, {}}), + toNullableVec({10, 20, 30, 40, {}, {}}), + toNullableVec({1, {}, {}, {}, {}, 3}), + toNullableVec({100, {}, {}, {}, 400, 300}), + }; + executeAndAssertColumnsEqual(request, expected); + + auto request_column_prune = context.scan("full_outer_no_other_condition", "t") + .join( + context.scan("full_outer_no_other_condition", "s"), + JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {}, + {}, + 0, + false, + 1) + .aggregation({Count(lit(static_cast(1)))}, {}) + .build(context); + ASSERT_COLUMNS_EQ_UR(genScalarCountResults(expected), executeStreams(request_column_prune, 2)); +} +CATCH + +TEST_F(JoinExecutorTestRunner, FullOuterJoinWithLeftAndRightConditions) +try +{ + using tipb::JoinType; + + context.addMockTable( + "full_outer_with_side_conditions", + "t", + {{"a", TiDB::TP::TypeLong}, {"c", TiDB::TP::TypeLong}}, + {toNullableVec("a", {1, 2}), toNullableVec("c", {5, 20})}); + context.addMockTable( + "full_outer_with_side_conditions", + "s", + {{"a", TiDB::TP::TypeLong}, {"c", TiDB::TP::TypeLong}}, + {toNullableVec("a", {1, 3}), toNullableVec("c", {200, 50})}); + + auto request = context.scan("full_outer_with_side_conditions", "t") + .join( + context.scan("full_outer_with_side_conditions", "s"), + JoinType::TypeFullOuterJoin, + {col("a")}, + {gt(col("t.c"), lit(Field(static_cast(10))))}, + {gt(col("s.c"), lit(Field(static_cast(100))))}, + {}, + {}, + 0, + false, + 1) + .build(context); + + const ColumnsWithTypeAndName expected = { + toNullableVec({1, 2, {}, {}}), + toNullableVec({5, 20, {}, {}}), + toNullableVec({{}, {}, 3, 1}), + toNullableVec({{}, {}, 50, 200}), + }; + executeAndAssertColumnsEqual(request, expected); + + auto request_column_prune = context.scan("full_outer_with_side_conditions", "t") + .join( + context.scan("full_outer_with_side_conditions", "s"), + JoinType::TypeFullOuterJoin, + {col("a")}, + {gt(col("t.c"), lit(Field(static_cast(10))))}, + {gt(col("s.c"), lit(Field(static_cast(100))))}, + {}, + {}, + 0, + false, + 1) + .aggregation({Count(lit(static_cast(1)))}, {}) + .build(context); + ASSERT_COLUMNS_EQ_UR(genScalarCountResults(expected), executeStreams(request_column_prune, 2)); +} +CATCH + #undef WRAP_FOR_JOIN_TEST_BEGIN #undef WRAP_FOR_JOIN_TEST_END diff --git a/dbms/src/Flash/tests/gtest_spill_join.cpp b/dbms/src/Flash/tests/gtest_spill_join.cpp index 99058189bf3..790b46889ef 100644 --- a/dbms/src/Flash/tests/gtest_spill_join.cpp +++ b/dbms/src/Flash/tests/gtest_spill_join.cpp @@ -578,6 +578,62 @@ try } CATCH +TEST_F(SpillJoinTestRunner, FullOuterJoinWithOtherConditionSpill) +try +{ + UInt64 max_block_size = 800; + size_t original_max_streams = 20; + UInt64 max_bytes_before_external_join = 20000; + String left_table_name = "left_table_10_concurrency"; + String right_table_name = "right_table_10_concurrency"; + + WRAP_FOR_SPILL_TEST_BEGIN + auto request = context.scan("outer_join_test", left_table_name) + .join( + context.scan("outer_join_test", right_table_name), + tipb::JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {lt(col(left_table_name + ".b"), col(right_table_name + ".b"))}, + {}, + 0, + false, + 1) + .project( + {fmt::format("{}.a", left_table_name), + fmt::format("{}.b", left_table_name), + fmt::format("{}.a", right_table_name), + fmt::format("{}.b", right_table_name)}) + .build(context); + auto request_column_prune = context.scan("outer_join_test", left_table_name) + .join( + context.scan("outer_join_test", right_table_name), + tipb::JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {lt(col(left_table_name + ".b"), col(right_table_name + ".b"))}, + {}, + 0, + false, + 1) + .aggregation({Count(lit(static_cast(1)))}, {}) + .build(context); + + context.context->setSetting("max_block_size", Field(static_cast(max_block_size))); + context.context->setSetting("max_bytes_before_external_join", Field(static_cast(0))); + auto ref_columns = executeStreams(request, original_max_streams); + + context.context->setSetting( + "max_bytes_before_external_join", + Field(static_cast(max_bytes_before_external_join))); + ASSERT_COLUMNS_EQ_UR(ref_columns, executeStreams(request, original_max_streams)); + ASSERT_COLUMNS_EQ_UR(genScalarCountResults(ref_columns), executeStreams(request_column_prune, 2)); + WRAP_FOR_SPILL_TEST_END +} +CATCH + #undef WRAP_FOR_SPILL_TEST_BEGIN #undef WRAP_FOR_SPILL_TEST_END diff --git a/dbms/src/Interpreters/Join.cpp b/dbms/src/Interpreters/Join.cpp index 35c0ba8db93..2781a61e0cc 100644 --- a/dbms/src/Interpreters/Join.cpp +++ b/dbms/src/Interpreters/Join.cpp @@ -917,7 +917,7 @@ void Join::handleOtherConditions(Block & block, IColumn::Filter * anti_filter, I mergeNullAndFilterResult(block, filter, non_equal_conditions.other_eq_cond_from_in_name, isAntiJoin(kind)); assert(block_rows == filter.size()); - if (isInnerJoin(kind) || isNecessaryKindToUseRowFlaggedHashMap(kind)) + if (isInnerJoin(kind) || (isNecessaryKindToUseRowFlaggedHashMap(kind) && kind != ASTTableJoin::Kind::Full)) { erase_useless_column(block); /// inner | rightSemi | rightAnti | rightOuter join, just use other_filter_column to filter result @@ -926,6 +926,17 @@ void Join::handleOtherConditions(Block & block, IColumn::Filter * anti_filter, I return; } + PointerHelper::ArrayType * full_join_mapped_entries = nullptr; + if (kind == ASTTableJoin::Kind::Full) + { + RUNTIME_CHECK(!flag_mapped_entry_helper_name.empty()); + auto & mapped_column = block.getByName(flag_mapped_entry_helper_name).column; + auto mutable_mapped_column = (*std::move(mapped_column)).mutate(); + auto & ptr_col = static_cast(*mutable_mapped_column); + full_join_mapped_entries = &ptr_col.getData(); + mapped_column = std::move(mutable_mapped_column); + } + bool is_semi_family = isSemiFamily(kind) || isLeftOuterSemiFamily(kind); for (size_t i = 0, prev_offset = 0; i < offsets_to_replicate->size(); ++i) { @@ -948,8 +959,12 @@ void Join::handleOtherConditions(Block & block, IColumn::Filter * anti_filter, I if (prev_offset < current_offset) { /// for outer join, at least one row must be kept - if (isLeftOuterJoin(kind) && !has_row_kept) + if ((isLeftOuterJoin(kind) || kind == ASTTableJoin::Kind::Full) && !has_row_kept) + { row_filter[prev_offset] = 1; + if (full_join_mapped_entries != nullptr) + (*full_join_mapped_entries)[prev_offset] = 0; + } if (isAntiJoin(kind)) { if (has_row_kept && !(*anti_filter)[i]) @@ -966,9 +981,9 @@ void Join::handleOtherConditions(Block & block, IColumn::Filter * anti_filter, I prev_offset = current_offset; } erase_useless_column(block); - if (isLeftOuterJoin(kind)) + if (isLeftOuterJoin(kind) || kind == ASTTableJoin::Kind::Full) { - /// for left join, convert right column to null if not joined + /// for left/full join, convert right column to null if not joined applyNullToNotMatchedRows(block, right_sample_block, *filter_column); for (size_t i = 0; i < block.columns(); ++i) block.getByPosition(i).column = block.getByPosition(i).column->filter(row_filter, -1); @@ -1283,6 +1298,8 @@ Block Join::doJoinBlockHash(ProbeProcessInfo & probe_process_info, const JoinBui for (size_t i = 0; i < block.rows(); ++i) { auto ptr_value = container[i]; + if (ptr_value == 0) + continue; auto * current = reinterpret_cast(ptr_value); current->setUsed(); } @@ -1292,7 +1309,7 @@ Block Join::doJoinBlockHash(ProbeProcessInfo & probe_process_info, const JoinBui // Return build table header for right semi/anti join block = right_sample_block; } - else if (kind == ASTTableJoin::Kind::RightOuter) + else if (kind == ASTTableJoin::Kind::RightOuter || kind == ASTTableJoin::Kind::Full) { block.erase(flag_mapped_entry_helper_name); } diff --git a/dbms/src/Interpreters/JoinPartition.cpp b/dbms/src/Interpreters/JoinPartition.cpp index b0020287820..fe4048d7765 100644 --- a/dbms/src/Interpreters/JoinPartition.cpp +++ b/dbms/src/Interpreters/JoinPartition.cpp @@ -1453,6 +1453,28 @@ struct RowFlaggedHashMapAdder (*offsets)[i] = current_offset; return false; } + + static bool addNotFoundForFull( + size_t num_columns_to_add, + MutableColumns & added_columns, + size_t i, + IColumn::Offset & current_offset, + IColumn::Offsets * offsets, + ProbeProcessInfo & probe_process_info) + { + assert(num_columns_to_add + 1 == added_columns.size()); + if (current_offset && current_offset + 1 > probe_process_info.max_block_size) + return true; + + ++current_offset; + (*offsets)[i] = current_offset; + for (size_t j = 0; j < num_columns_to_add; ++j) + added_columns[j]->insertDefault(); + + auto & actual_ptr_col = static_cast(*added_columns[num_columns_to_add]); + actual_ptr_col.getData().push_back(0); + return false; + } }; template < @@ -1515,7 +1537,21 @@ void NO_INLINE probeBlockImplTypeCase( { if constexpr (row_flagged_map) { - block_full = RowFlaggedHashMapAdder::addNotFound(i, current_offset, offsets_to_replicate.get()); + if constexpr (KIND == ASTTableJoin::Kind::Full) + { + block_full = RowFlaggedHashMapAdder::addNotFoundForFull( + num_columns_to_add, + added_columns, + i, + current_offset, + offsets_to_replicate.get(), + probe_process_info); + } + else + { + block_full + = RowFlaggedHashMapAdder::addNotFound(i, current_offset, offsets_to_replicate.get()); + } } /// RightSemi/RightAnti without other conditions, just ignore not matched probe rows else if constexpr (KIND != ASTTableJoin::Kind::RightSemi && KIND != ASTTableJoin::Kind::RightAnti) @@ -1616,8 +1652,21 @@ void NO_INLINE probeBlockImplTypeCase( { if constexpr (row_flagged_map) { - block_full - = RowFlaggedHashMapAdder::addNotFound(i, current_offset, offsets_to_replicate.get()); + if constexpr (KIND == ASTTableJoin::Kind::Full) + { + block_full = RowFlaggedHashMapAdder::addNotFoundForFull( + num_columns_to_add, + added_columns, + i, + current_offset, + offsets_to_replicate.get(), + probe_process_info); + } + else + { + block_full + = RowFlaggedHashMapAdder::addNotFound(i, current_offset, offsets_to_replicate.get()); + } } /// RightSemi/RightAnti without other conditions, just ignore not matched probe rows else if constexpr (KIND != ASTTableJoin::Kind::RightSemi && KIND != ASTTableJoin::Kind::RightAnti) @@ -2122,8 +2171,10 @@ void JoinPartition::probeBlock( CALL(Inner, All, MapsAll, false) else if (kind == LeftOuter && strictness == All) CALL(LeftOuter, All, MapsAll, false) - else if (kind == Full && strictness == All) + else if (kind == Full && strictness == All && !use_row_flagged_map) CALL(LeftOuter, All, MapsAllFull, false) + else if (kind == Full && strictness == All && use_row_flagged_map) + CALL(Full, All, MapsAllFullWithRowFlag, true) else if (kind == RightOuter && strictness == All && !use_row_flagged_map) CALL(Inner, All, MapsAllFull, false) else if (kind == RightOuter && strictness == All && use_row_flagged_map) diff --git a/dbms/src/Interpreters/JoinUtils.h b/dbms/src/Interpreters/JoinUtils.h index ba1187f79ec..8a6ba93a285 100644 --- a/dbms/src/Interpreters/JoinUtils.h +++ b/dbms/src/Interpreters/JoinUtils.h @@ -88,7 +88,7 @@ inline bool needScanHashMapAfterProbe(ASTTableJoin::Kind kind) inline bool isNecessaryKindToUseRowFlaggedHashMap(ASTTableJoin::Kind kind) { - return isRightSemiFamily(kind) || kind == ASTTableJoin::Kind::RightOuter; + return isRightSemiFamily(kind) || kind == ASTTableJoin::Kind::RightOuter || kind == ASTTableJoin::Kind::Full; } inline bool useRowFlaggedHashMap(ASTTableJoin::Kind kind, bool has_other_condition) diff --git a/dbms/src/TestUtils/tests/gtest_mock_executors.cpp b/dbms/src/TestUtils/tests/gtest_mock_executors.cpp index ff29392180f..deb2dd8b0a8 100644 --- a/dbms/src/TestUtils/tests/gtest_mock_executors.cpp +++ b/dbms/src/TestUtils/tests/gtest_mock_executors.cpp @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include @@ -42,6 +43,18 @@ class MockDAGRequestTest : public DB::tests::ExecutorTest } }; +namespace +{ +void collectColumnRefs(const tipb::Expr & expr, std::vector & column_refs) +{ + if (expr.tp() == tipb::ExprType::ColumnRef) + column_refs.push_back(&expr); + + for (const auto & child : expr.children()) + collectColumnRefs(child, column_refs); +} +} // namespace + TEST_F(MockDAGRequestTest, MockTable) try { @@ -265,6 +278,75 @@ try } CATCH +TEST_F(MockDAGRequestTest, FullOuterJoinSchemaIsNullable) +try +{ + context.addMockTable( + {"full_outer_test", "l"}, + {{"a", TiDB::TP::TypeLong, false}, {"b", TiDB::TP::TypeLong, false}}); + context.addMockTable( + {"full_outer_test", "r"}, + {{"a", TiDB::TP::TypeLong, false}, {"c", TiDB::TP::TypeLong, false}}); + + auto request = context.scan("full_outer_test", "l") + .join( + context.scan("full_outer_test", "r"), + tipb::JoinType::TypeFullOuterJoin, + {col("a")}, + {}, + {}, + {gt(col("b"), col("c"))}, + {}) + .build(context); + + ASSERT_EQ(request->root_executor().tp(), tipb::ExecType::TypeJoin); + const auto & join = request->root_executor().join(); + ASSERT_EQ(join.other_conditions_size(), 1); + + std::vector column_refs; + collectColumnRefs(join.other_conditions(0), column_refs); + ASSERT_EQ(column_refs.size(), 2); + for (const auto * column_ref : column_refs) + ASSERT_EQ(column_ref->field_type().flag() & TiDB::ColumnFlagNotNull, 0); + + auto output_field_types = collectOutputFieldTypes(*request); + ASSERT_EQ(output_field_types.size(), 4); + for (const auto & field_type : output_field_types) + ASSERT_EQ(field_type.flag() & TiDB::ColumnFlagNotNull, 0); +} +CATCH + +TEST_F(MockDAGRequestTest, SemiJoinColumnPruneKeepsJoinOutputSchema) +try +{ + const std::vector> test_cases = { + {tipb::JoinType::TypeSemiJoin, 3}, + {tipb::JoinType::TypeAntiSemiJoin, 3}, + {tipb::JoinType::TypeLeftOuterSemiJoin, 4}, + {tipb::JoinType::TypeAntiLeftOuterSemiJoin, 4}, + }; + + for (const auto & [join_type, expected_size] : test_cases) + { + auto request = context.scan("test_db", "l_table") + .join(context.scan("test_db", "r_table"), join_type, {col("join_c")}) + .build(context); + + auto output_field_types = collectOutputFieldTypes(*request); + ASSERT_EQ(output_field_types.size(), expected_size) << fmt::underlying(join_type); + ASSERT_EQ(output_field_types[0].tp(), TiDB::TypeLong); + ASSERT_EQ(output_field_types[1].tp(), TiDB::TypeString); + ASSERT_EQ(output_field_types[2].tp(), TiDB::TypeString); + + if (expected_size == 4) + { + ASSERT_EQ(output_field_types[3].tp(), TiDB::TypeTiny); + ASSERT_EQ(output_field_types[3].flag() & TiDB::ColumnFlagNotNull, 0); + } + } +} +CATCH + TEST_F(MockDAGRequestTest, ExchangeSender) try { diff --git a/docs/note/fullouter_join.md b/docs/note/fullouter_join.md new file mode 100644 index 00000000000..d6d87c7f1ee --- /dev/null +++ b/docs/note/fullouter_join.md @@ -0,0 +1,272 @@ +# TiFlash FULL OUTER JOIN Support Change List (Equi Join Only in This Round) + +## Background + +The TiFlash kernel, which inherits the Join architecture from ClickHouse, already has basic support for `ASTTableJoin::Kind::Full`. However, because TiDB has not pushed down full outer join for a long time, this path has not been continuously covered. In particular, the `other condition` logic added later mostly covered left/right outer joins only. + +TiDB is now preparing to support full outer join. After it is pushed down to TiFlash, we need to complete the protocol mapping, output schema, execution semantics, and test coverage together. + +The scope of this round is explicitly limited to `FULL OUTER JOIN` with non-empty `left_join_keys/right_join_keys`, which means the hash join path with equi join keys. + +## Current Conclusions + +### Existing Capabilities That Can Be Reused + +1. The SQL/AST layer already recognizes Full. +- `dbms/src/Parsers/ParserTablesInSelectQuery.cpp:117` +- `dbms/src/Parsers/ASTTablesInSelectQuery.cpp:165` + +2. In the hash join framework, `Full` is already treated as a join that needs to scan unmatched build-side rows after probing. +- `dbms/src/Interpreters/JoinUtils.h:26` +- `dbms/src/Interpreters/JoinUtils.h:84` + +3. The basic full path without `other condition` mostly exists. +- Nullable handling for probe-side columns: `dbms/src/Interpreters/ProbeProcessInfo.cpp:71` +- Nullable handling for build-side sample columns: `dbms/src/Interpreters/Join.cpp:346` +- Nullable handling for build blocks: `dbms/src/Interpreters/Join.cpp:696` +- Full uses `MapsAllFull` during probing: `dbms/src/Interpreters/JoinPartition.cpp:2124` + +### Main Gaps That Must Be Filled + +1. The TiDB DAG protocol and TiFlash JoinType mapping do not have full yet. +- `contrib/tipb/proto/executor.proto:184` +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp:68` + +2. The nullable rules for Full output/intermediate schemas are incomplete. They currently only handle left/right outer joins. +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp:298` +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp:333` +- `dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp:175` + +3. Correctness for `full + other condition` is currently broken. +- `handleOtherConditions` has no full branch and may directly throw a logical error: `dbms/src/Interpreters/Join.cpp:1032` +- Full currently uses `MapsAllFull`. When a key is hit, it calls `setUsed()` too early. If `other condition` filters the row out later, TiFlash cannot restore the "unmatched right row" state: `dbms/src/Interpreters/JoinPartition.cpp:1601` + +4. The current left/right condition validation does not allow full. +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h:97` + +5. Full with no equi key, where join keys are empty, is out of scope for this round. Cartesian full join will not be enabled in this round. + +## Detailed Changes + +## 1. Protocol and JoinType Mapping + +1. Add `TypeFullOuterJoin` to tipb, synchronized with the final protocol value from TiDB. +- Location: `contrib/tipb/proto/executor.proto` + +2. Add the full mapping in `JoinInterpreterHelper::getJoinKindAndBuildSideIndex`. +- Equal join with keys should at least support: + - `{TypeFullOuterJoin, 0} -> {ASTTableJoin::Kind::Full, 0}` + - `{TypeFullOuterJoin, 1} -> {ASTTableJoin::Kind::Full, 1}` +- Convention, consistent with TiDB tests: the build side of `FULL OUTER JOIN` is specified directly by `inner_idx`, that is, `build_side_index == inner_idx`. +- Note: the full case does not need to adjust `join kind` like left/right outer join does. The execution layer will still wire the probe/build roles according to `build_side_index` to satisfy TiFlash's internal right-build convention. + +3. Add full to stringification/logging branches to avoid falling into the default error path. +- `dbms/src/Flash/Coprocessor/DAGUtils.cpp:837` + +## 2. Nullable/Schema Rules for Full + +1. Join output schema: both left-side and right-side columns must be nullable for full. +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp:333` + +2. Input schema for compiling `other condition`: for full, both sides, plus the extra columns added during probe preparation, must follow full semantics and become nullable as needed. +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp:298` +- `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp:313` + +3. `collectOutputFieldTypes` must also make both sides nullable for externally visible field types. +- `dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp:175` +- `dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp:213` + +## 3. Validation for Left/Right Conditions Under Full + +`JoinNonEqualConditions::validate` currently limits left condition to left outer join and right condition to right outer join. Under full, both kinds of conditions may appear and should be allowed. + +- Location: `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h:95` + +## 4. Core Correctness: Full + Other Condition + +This is the most important change in this round. + +Current problem: + +1. During probing, full uses `MapsAllFull` and calls `setUsed()` as soon as the key is hit. +2. `other condition` may filter out those rows afterward. +3. However, the build-side "used" mark has already been set, so the post-probe scan will no longer output those records, even though they should be output as "unmatched right rows". + +The required behavior is: `used` can only be set after `other condition` passes. + +Implementation-wise, the following items 1-6 can conceptually be split into "mark timing", "`handleOtherConditions` semantics", and "post-probe scan integration". However, the current code paths are tightly coupled: + +1. If we only switch to row-flagged maps without adding a full branch in `handleOtherConditions`, `full + other condition` may still hit a runtime error branch. +2. If we only switch the probe-side map without updating the post-probe scan branch, the scan phase will still read the wrong hash map type. + +Therefore, the following items 1-6 should at least be delivered as one review/commit unit. Steps 6/7/8 later in this document remain as conceptual decomposition for easier verification, but they should not be mechanically split into three independent patches that can be merged separately. + +Suggested implementation direction: + +1. Make full use row-flagged maps (`MapsAllFullWithRowFlag`) when `has_other_condition=true`. +- Affected locations: + - `dbms/src/Interpreters/JoinUtils.h:89` + - `dbms/src/Interpreters/JoinPartition.cpp:281` + - `dbms/src/Interpreters/JoinPartition.cpp:971` + - `dbms/src/Interpreters/JoinPartition.cpp:1037` + +2. Add a full + row_flagged dispatch branch in `JoinPartition::probeBlock`. +- Full currently always uses `MapsAllFull`: `dbms/src/Interpreters/JoinPartition.cpp:2124` + +3. The row-flagged probe adder needs to support the "fallback left output" semantics of full. +- Currently `RowFlaggedHashMapAdder::addNotFound` does not append a default row: `dbms/src/Interpreters/JoinPartition.cpp:1450` +- For full, when not-found happens, it still needs to output one row with right-side defaults and write nullable values into the helper column. + +4. Add a full branch in `Join::handleOtherConditions` with semantics aligned to the fallback behavior of left outer join. +- Currently the "keep at least one row + set right side to null" logic only runs when `isLeftOuterJoin(kind)`: + - `dbms/src/Interpreters/Join.cpp:999` + - `dbms/src/Interpreters/Join.cpp:1017` + +5. In `Join::doJoinBlockHash` for full+row_flagged: +- When marking build rows as used according to helper pointers, skip null pointers. +- Remove helper temporary columns before outputting the result. +- Related location: `dbms/src/Interpreters/Join.cpp:1325` + +6. Switch the post-probe scan phase to the row_flagged branch for full+other_condition. +- `dbms/src/DataStreams/ScanHashMapAfterProbeBlockInputStream.cpp:256` + +## 5. Full Without Equi Keys (Cartesian Full) Is Out of Scope + +The current code does not have a complete execution path for full without keys, and this round does not enable that path. It is recommended to add an explicit guard to avoid accidentally taking an invalid path: + +1. If TiFlash receives `TypeFullOuterJoin` with empty join keys, explicitly return `Unimplemented/BadRequest`, and make the error message clear that "cartesian full is not supported". +2. If support is needed later, implement the cross full path as a separate task. The workload is clearly larger than equal full. + +It is recommended to reject this case in `getJoinKindAndBuildSideIndex` or further upstream, instead of letting it fall into unclear errors such as "Unknown join type". + +## 6. Debug/Mock and Test Construction Path + +To make gtests able to construct full join DAGs, the mock binder also needs to be updated. + +1. Add full to mock schema nullable rules, making both sides nullable. +- `dbms/src/Debug/MockExecutor/JoinBinder.cpp:99` +- `dbms/src/Debug/MockExecutor/JoinBinder.cpp:296` + +2. Add full to the old AST -> tipb test compilation path. +- `dbms/src/Debug/MockExecutor/JoinBinder.cpp:384` + +## 7. Test Change Suggestions + +Minimum recommended coverage: + +1. Coprocessor mapping tests +- `dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp` +- Add full + `inner_idx=0/1` with keys. +- Explicitly assert `build_side_index == inner_idx` for full. +- Optionally add one no-key rejection case to verify the error message, without implementing the execution logic. + +2. Join Executor correctness +- Refer to the existing construction pattern for `RightOuterJoin`: `dbms/src/Flash/tests/gtest_join_executor.cpp:4031` +- Focus on adding full cases: + - With keys, without other condition + - With keys, with other condition + - Key hit but all rows fail `other condition` (must output both left-unmatched and right-unmatched rows) + - With left/right conditions + - With null keys + +3. Spill / fine-grained shuffle +- Add at least one full + other condition spill case. +- Existing join type arrays still contain only seven join types: + - `dbms/src/Flash/tests/gtest_join.h:184` + - `dbms/src/Flash/tests/gtest_compute_server.cpp:1248` +- It is not recommended to expand all full arrays to eight join types and rewrite many expected results immediately. Add targeted full cases first. + +## Suggested Implementation Order + +1. Sync tipb and get `TypeFullOuterJoin`. +2. Enable JoinType mapping, schema nullable handling, and `getJoinTypeName`. +3. Allow left/right condition validation for full. +4. Implement the main correctness path for `full + other condition` together: row-flagged path, `handleOtherConditions`, and post-probe scan integration. +5. Add an explicit rejection for full without equi keys, which is not implemented in this round. +6. Add gtests, starting with targeted cases before considering expanding the join type matrix. + +## Development Steps (Executable Checklist) + +1. Step 1: enable the protocol enum, only full and only with equi keys +- Goal: TiFlash can recognize `TypeFullOuterJoin`. +- Changes: + - Add `TypeFullOuterJoin` to `contrib/tipb/proto/executor.proto`. + - Generate/sync the corresponding protobuf code according to the repository's existing workflow. +- Acceptance: + - The code compiles without `JoinType` enum errors. + - `TypeFullOuterJoin` can be referenced in TiFlash code. + +2. Step 2: JoinType mapping and build-side convention +- Goal: map full to `ASTTableJoin::Kind::Full`, with `build_side_index == inner_idx`. +- Changes: + - `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp` + - `dbms/src/Flash/Coprocessor/DAGUtils.cpp` (`getJoinTypeName`) + - `dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp`, if needed +- Acceptance: + - full + `inner_idx=0/1` both return `kind=Full`, and `build_side_index` is equal to `inner_idx`. + +3. Step 3: equi-key scope guard, explicitly reject no-key full +- Goal: cartesian full is not implemented in this round, and TiFlash returns a clear error for this request. +- Changes: + - Prefer adding the guard in `JoinInterpreterHelper::getJoinKindAndBuildSideIndex(...)` or further upstream. +- Acceptance: + - When `TypeFullOuterJoin` has `join_keys_size==0`, the error message clearly says cartesian full is not supported. + +4. Step 4: make output and other-condition input schemas fully nullable for full +- Goal: under full, both left-side and right-side output columns, and the columns used to compile other-condition, are treated as nullable. +- Changes: + - `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.cpp` + - `dbms/src/Flash/Coprocessor/collectOutputFieldTypes.cpp` + - `dbms/src/Debug/MockExecutor/JoinBinder.cpp` for the test construction path +- Acceptance: + - In full output schema, columns from both sides are nullable, without affecting the semi join family. + +5. Step 5: allow left/right condition validation for full +- Goal: full can carry left/right conditions and will not be rejected by `validate`. +- Changes: + - `dbms/src/Flash/Coprocessor/JoinInterpreterHelper.h` (`JoinNonEqualConditions::validate`) +- Acceptance: + - Constructing full + left_condition and full + right_condition does not report "non left/right join with ... conditions". + +6. Step 6: core change A - used mark timing for full + other condition (recommended to deliver together with steps 7 and 8) +- Goal: mark build rows as used only after `other condition` passes, so unmatched right rows are not lost. +- Notes: + - In the current code, row-flagged map selection, full semantics in `handleOtherConditions`, and the post-probe scan branch are tightly coupled. + - If only this step is applied without steps 7 and 8, the code may partially compile, but `full + other condition` still cannot form a runnable and verifiable complete path. +- Changes: + - `dbms/src/Interpreters/JoinUtils.h` to make full+other_condition enter the row-flagged path + - `dbms/src/Interpreters/JoinPartition.cpp` for map initialization, probe dispatch, and adder not-found behavior + - `dbms/src/Interpreters/Join.cpp` to skip null pointers when marking used rows in `doJoinBlockHash` +- Acceptance: + - In the case where keys hit but all rows fail `other condition`, right-side rows can still be output during the post-probe scan. + +7. Step 7: core change B - full semantics in `handleOtherConditions` (usually delivered together with step 6) +- Goal: full also needs the correct "outer join keeps at least one row + set right side to null" behavior. +- Changes: + - `dbms/src/Interpreters/Join.cpp` (`handleOtherConditions` branch) +- Acceptance: + - Results of full + other_condition are consistent with the symmetric combination semantics of left/right outer join. + +8. Step 8: integrate the post-probe scan phase (usually delivered together with step 6) +- Goal: full+other_condition uses the row-flagged map to scan unmatched right rows. +- Changes: + - `dbms/src/DataStreams/ScanHashMapAfterProbeBlockInputStream.cpp` +- Acceptance: + - full+other_condition can correctly output unmatched build-side rows without missing or duplicating rows. + +9. Step 9: add tests, starting with targeted cases +- Goal: guarantee correctness first, then expand matrices if needed. +- Suggested changes: + - `dbms/src/Flash/Coprocessor/tests/gtest_join_get_kind_and_build_index.cpp` + - `dbms/src/Flash/tests/gtest_join_executor.cpp` + - `dbms/src/Flash/tests/gtest_spill_join.cpp` (at least one case) +- Minimum acceptance set: + - full + key + no other_condition + - full + key + with other_condition + - full + key hit but all rows fail other_condition (critical regression) + - full + left/right conditions + - full + null key + +## One-Line Risk Summary + +If we only implement "JoinType mapping + nullable output" without changing the row-flagged logic, `FULL OUTER JOIN ... ON eq_key AND other_condition` will miss right-side rows. This is a result-correctness bug, not a performance issue, and must be completed together with protocol enablement.