// Patch summary (git diff touching three files):
//  * bolt/core/QueryConfig.h: adds the config key
//    "json_to_map_escape_control_chars" (kJsonToMapEscapeControlChars) and the
//    accessor jsonToMapEscapeControlChars(), which passes `true` as the
//    default value to get().
//  * bolt/functions/sparksql/JsonToMap.cpp: JsonToMapFunction::apply now reads
//    the flag from the query config on every call and forwards it as the new
//    `escapeControlChars` argument to applySonic / applySimdJson. In both
//    helper hunks the "escape raw control chars and retry" fallback now runs
//    only when the first parse failed AND the flag is true. useSonic_ is still
//    assigned inside folly::call_once, whereas the new flag is read outside it.
//  * bolt/functions/sparksql/tests/JsonToMapTest.cpp: adds a helper that
//    copies the raw configs, sets the new key and overrides the config on
//    queryCtx_; pins the flag to true in the existing unescapedControlChars
//    test; and adds unescapedControlCharsRetryConfig, which expects a null map
//    entry for a JSON value containing a raw newline when the flag is false
//    and the value "a\nb" when the flag is true.
// NOTE(review): the test helper stores std::to_string(bool), i.e. "1"/"0" --
//   presumably the config's bool parsing accepts numeric strings; confirm.
// NOTE(review): this text looks like it lost its line breaks and template
//   argument lists (e.g. `std::vector& args`) -- verify against the original
//   patch before applying it.
diff --git a/bolt/core/QueryConfig.h b/bolt/core/QueryConfig.h index db5566da4..3ffdfdbb1 100644 --- a/bolt/core/QueryConfig.h +++ b/bolt/core/QueryConfig.h @@ -744,6 +744,10 @@ class QueryConfig { static constexpr const char* kEnableSonicJsonToMap = "sonic.json_to_map"; + /// Whether json_to_map escapes raw control chars and retries parse. + static constexpr const char* kJsonToMapEscapeControlChars = + "json_to_map_escape_control_chars"; + static constexpr const char* kEnableSonicIsJsonScalar = "sonic.is_json_scalar"; @@ -1683,6 +1687,10 @@ class QueryConfig { return get(kEnableSonicJsonToMap, true); } + bool jsonToMapEscapeControlChars() const { + return get(kJsonToMapEscapeControlChars, true); + } + bool enableSonicIsJsonScalar() const { return get(kEnableSonicIsJsonScalar, true); } diff --git a/bolt/functions/sparksql/JsonToMap.cpp b/bolt/functions/sparksql/JsonToMap.cpp index 9a6e5f85f..706be7727 100644 --- a/bolt/functions/sparksql/JsonToMap.cpp +++ b/bolt/functions/sparksql/JsonToMap.cpp @@ -68,15 +68,17 @@ class JsonToMapFunction : public exec::VectorFunction { const TypePtr& outputType, exec::EvalCtx& context, VectorPtr& result) const override { - folly::call_once(initUseSonic_, [&] { - useSonic_ = - context.execCtx()->queryCtx()->queryConfig().enableSonicJsonParse(); - }); + const auto& queryConfig = context.execCtx()->queryCtx()->queryConfig(); + folly::call_once( + initUseSonic_, [&] { useSonic_ = queryConfig.enableSonicJsonParse(); }); + + const auto escapeControlChars = queryConfig.jsonToMapEscapeControlChars(); if (useSonic_) { - applySonic(rows, args, outputType, context, result); + applySonic(rows, args, outputType, escapeControlChars, context, result); } else { - applySimdJson(rows, args, outputType, context, result); + applySimdJson( + rows, args, outputType, escapeControlChars, context, result); } } @@ -84,6 +86,7 @@ class JsonToMapFunction : public exec::VectorFunction { const SelectivityVector& rows, std::vector& args, const TypePtr& 
outputType, + bool escapeControlChars, exec::EvalCtx& context, VectorPtr& result) const { BaseVector::ensureWritable( @@ -141,7 +144,7 @@ class JsonToMapFunction : public exec::VectorFunction { padded_data = current; bool ok = parseInto(padded_data); - if (!ok) { + if (!ok && escapeControlChars) { // On failure, escape raw control chars and retry (only the rare // failure path; valid JSON is unaffected). padded_data = escapeUnescapedControlChars(current); @@ -166,6 +169,7 @@ class JsonToMapFunction : public exec::VectorFunction { const SelectivityVector& rows, std::vector& args, const TypePtr& outputType, + bool escapeControlChars, exec::EvalCtx& context, VectorPtr& result) const { BaseVector::ensureWritable( @@ -191,7 +195,7 @@ class JsonToMapFunction : public exec::VectorFunction { kParseIntegerAsRaw | kParseOverflowNumAsNumStr; doc.Parse(current); std::string escaped; - if (doc.HasParseError()) { + if (doc.HasParseError() && escapeControlChars) { // On failure, escape raw control chars and retry (matching jsoniter); // `escaped` must outlive the member iteration below. 
escaped = escapeUnescapedControlChars(current); diff --git a/bolt/functions/sparksql/tests/JsonToMapTest.cpp b/bolt/functions/sparksql/tests/JsonToMapTest.cpp index 7bf0ab084..766e6d459 100644 --- a/bolt/functions/sparksql/tests/JsonToMapTest.cpp +++ b/bolt/functions/sparksql/tests/JsonToMapTest.cpp @@ -27,6 +27,13 @@ class JsonToMapTest : public SparkFunctionBaseTest { expr, makeRowVector({makeFlatVector({inputs[0]})})); } + void setJsonToMapEscapeControlChars(bool escapeControlChars) { + auto config = queryCtx_->queryConfig().rawConfigsCopy(); + config[core::QueryConfig::kJsonToMapEscapeControlChars] = + std::to_string(escapeControlChars); + queryCtx_->testingOverrideConfigUnsafe(std::move(config)); + } + void testJsonToMap( const std::vector& inputs, const std::vector>>& @@ -147,6 +154,7 @@ TEST_F(JsonToMapTest, unescapedControlChars) { // escaped, but the reference Hive UDF (backed by com.jsoniter) accepts raw // control chars and keeps them verbatim in the values. json_to_map must // match that lenient behavior instead of returning SQL NULL. + setJsonToMapEscapeControlChars(true); { // Raw newlines embedded in several values; they must be preserved. StringView json = @@ -178,6 +186,17 @@ TEST_F(JsonToMapTest, unescapedControlChars) { } } +TEST_F(JsonToMapTest, unescapedControlCharsRetryConfig) { + const StringView json = StringView("{\"k\":\"a\nb\"}"); + setJsonToMapEscapeControlChars(false); + auto result = evaluateJsonToMap({json}); + auto expected = makeNullableMapVector({std::nullopt}); + assertEqualVectors(expected, result); + + setJsonToMapEscapeControlChars(true); + testJsonToMap({json}, {{"k", StringView("a\nb")}}); +} + TEST_F(JsonToMapTest, numericValuesPreserveOriginalText) { // Numeric values must be returned as their original JSON token, matching the // Hive reference UDF (com.jsoniter). They must NOT be round-tripped through a