Skip to content

Commit

Permalink
[opt](function) optimize from_unixtime/date_format by specially forma…
Browse files Browse the repository at this point in the history
…t str (#40821)

```
 mysql [test]>select count(date_format(a, 'yyyyMMdd')) from date_format_tmp;
+-----------------------------------+
| count(date_format(a, 'yyyyMMdd')) |
+-----------------------------------+
|                          16000000 |
+-----------------------------------+
1 row in set (0.53 sec)


mysql [test]>select count(date_format(a, 'yyyyMMdd')) from date_format_tmp;
+-----------------------------------+
| count(date_format(a, 'yyyyMMdd')) |
+-----------------------------------+
|                          16000000 |
+-----------------------------------+
1 row in set (0.28 sec)
```
  • Loading branch information
Mryange authored Sep 18, 2024
1 parent 1501597 commit 32d4b08
Show file tree
Hide file tree
Showing 5 changed files with 357 additions and 67 deletions.
156 changes: 156 additions & 0 deletions be/src/vec/functions/date_format_type.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <variant>

#include "vec/common/string_ref.h"

namespace doris::vectorized::time_format_type {
// Used to optimize commonly used date formats.

// Translates the three recognised "%"-style formats ("%Y%m%d", "%Y-%m-%d",
// "%Y-%m-%d %H:%i:%s") into their "yyyy"-style spelling.
//
// raw_str / str_size describe the format text supplied by the caller.
// Returns a StringRef over a function-local static string when the input matches
// one of the recognised formats; otherwise returns a StringRef over the caller's
// own bytes, unchanged.
inline StringRef rewrite_specific_format(const char* raw_str, size_t str_size) {
    const static std::string percent_style[3] = {"%Y%m%d", "%Y-%m-%d", "%Y-%m-%d %H:%i:%s"};
    const static std::string letter_style[3] = {"yyyyMMdd", "yyyy-MM-dd", "yyyy-MM-dd HH:mm:ss"};

    const StringRef requested {raw_str, str_size};
    for (int idx = 0; idx < 3; ++idx) {
        const std::string& candidate = percent_style[idx];
        const StringRef candidate_ref {candidate.data(), candidate.size()};
        if (candidate_ref == requested) {
            const std::string& replacement = letter_style[idx];
            return {replacement.data(), replacement.size()};
        }
    }
    // Not one of the special formats: hand the original text back untouched.
    return {raw_str, str_size};
}

// Writes y as exactly four zero-padded decimal digits into buf[i] .. buf[i + 3]
// and advances i by 4. No NUL terminator is written and buf is not bounds-checked.
// The output is only meaningful for 0 <= y <= 9999.
template <typename T>
void put_year(T y, char* buf, int& i) {
    const int high_pair = y / 100; // first two digits
    const int low_pair = y % 100;  // last two digits

    char* out = buf + i;
    out[0] = static_cast<char>(high_pair / 10 + '0');
    out[1] = static_cast<char>(high_pair % 10 + '0');
    out[2] = static_cast<char>(low_pair / 10 + '0');
    out[3] = static_cast<char>(low_pair % 10 + '0');
    i += 4;
}

// Writes m as exactly two zero-padded decimal digits into buf[i] and buf[i + 1]
// and advances i by 2. No NUL terminator is written and buf is not bounds-checked.
// The output is only meaningful for 0 <= m <= 99.
template <typename T>
void put_other(T m, char* buf, int& i) {
    char* out = buf + i;
    out[0] = static_cast<char>(m / 10 + '0');
    out[1] = static_cast<char>(m % 10 + '0');
    i += 2;
}

// NoneImpl is an empty tag type: it indicates that no specific optimization has been
// applied, and the general formatting logic is used for processing.
// string_to_impl() returns it for every format string it does not recognise.
struct NoneImpl {};

// Fixed-width formatter for the "yyyyMMdd" layout (e.g. 20240918).
struct yyyyMMddImpl {
    // Writes year, month and day as 8 digits into buf and returns the number of
    // characters written (8). No NUL terminator is appended; buf must have room
    // for 8 bytes. DateType must provide year(), month() and day().
    template <typename DateType>
    static size_t date_to_str(const DateType& date_value, char* buf) {
        int written = 0;
        put_year(date_value.year(), buf, written);
        put_other(date_value.month(), buf, written);
        put_other(date_value.day(), buf, written);
        return written;
    }
};

// Fixed-width formatter for the "yyyy-MM-dd" layout (e.g. 2024-09-18).
struct yyyy_MM_ddImpl {
    // Writes year, month and day separated by '-' into buf and returns the number
    // of characters written (10). No NUL terminator is appended; buf must have room
    // for 10 bytes. DateType must provide year(), month() and day().
    template <typename DateType>
    static size_t date_to_str(const DateType& date_value, char* buf) {
        int written = 0;
        put_year(date_value.year(), buf, written);
        buf[written++] = '-';
        put_other(date_value.month(), buf, written);
        buf[written++] = '-';
        put_other(date_value.day(), buf, written);
        return written;
    }
};

// Fixed-width formatter for the "yyyy-MM-dd HH:mm:ss" layout (e.g. 2024-09-18 07:05:09).
struct yyyy_MM_dd_HH_mm_ssImpl {
    // Writes the date part, a single space, then the time part into buf and returns
    // the number of characters written (19). No NUL terminator is appended; buf must
    // have room for 19 bytes. DateType must provide year(), month(), day(), hour(),
    // minute() and second().
    template <typename DateType>
    static size_t date_to_str(const DateType& date_value, char* buf) {
        int written = 0;

        // Date part: yyyy-MM-dd
        put_year(date_value.year(), buf, written);
        buf[written++] = '-';
        put_other(date_value.month(), buf, written);
        buf[written++] = '-';
        put_other(date_value.day(), buf, written);

        buf[written++] = ' ';

        // Time part: HH:mm:ss
        put_other(date_value.hour(), buf, written);
        buf[written++] = ':';
        put_other(date_value.minute(), buf, written);
        buf[written++] = ':';
        put_other(date_value.second(), buf, written);

        return written;
    }
};

// Fixed-width formatter for the "yyyy-MM" layout (e.g. 2024-09).
struct yyyy_MMImpl {
    // Writes year and month separated by '-' into buf and returns the number of
    // characters written (7). No NUL terminator is appended; buf must have room
    // for 7 bytes. DateType must provide year() and month().
    template <typename DateType>
    static size_t date_to_str(const DateType& date_value, char* buf) {
        int written = 0;
        put_year(date_value.year(), buf, written);
        buf[written++] = '-';
        put_other(date_value.month(), buf, written);
        return written;
    }
};
// Fixed-width formatter for the "yyyyMM" layout (e.g. 202409).
struct yyyyMMImpl {
    // Writes year and month as 6 digits into buf and returns the number of
    // characters written (6). No NUL terminator is appended; buf must have room
    // for 6 bytes. DateType must provide year() and month().
    template <typename DateType>
    static size_t date_to_str(const DateType& date_value, char* buf) {
        int written = 0;
        put_year(date_value.year(), buf, written);
        put_other(date_value.month(), buf, written);
        return written;
    }
};

// Fixed-width formatter for the "yyyy" layout (e.g. 2024).
struct yyyyImpl {
    // Writes the year as 4 digits into buf and returns the number of characters
    // written (4). No NUL terminator is appended; buf must have room for 4 bytes.
    // DateType must provide year().
    template <typename DateType>
    static size_t date_to_str(const DateType& date_value, char* buf) {
        int written = 0;
        put_year(date_value.year(), buf, written);
        return written;
    }
};

// Closed set of formatter tag types that string_to_impl() can return.
// NoneImpl is listed first, so a default-constructed FormatImplVariant holds NoneImpl
// (the general, non-optimized path). Each of the other alternatives exposes a static
// date_to_str(date_value, buf) that writes one fixed-width layout.
using FormatImplVariant = std::variant<NoneImpl, yyyyMMddImpl, yyyy_MM_ddImpl,
yyyy_MM_dd_HH_mm_ssImpl, yyyy_MMImpl, yyyyMMImpl, yyyyImpl>;

// Default format string and the formatter that string_to_impl() selects for it.
// NOTE(review): presumably used by callers when no explicit format argument is
// supplied — confirm against the call sites.
//
// These are C++17 inline variables rather than `const static`: a `static` object
// defined in a header is duplicated in every translation unit that includes it,
// and the std::string is initialised separately in each one. `inline` yields a
// single shared definition across the program.
inline const std::string default_format = "yyyy-MM-dd HH:mm:ss";
inline constexpr auto default_impl = yyyy_MM_dd_HH_mm_ssImpl {};
// Picks the fixed-width formatter that corresponds to `format`.
//
// The three date / datetime layouts are recognised in both their "yyyy"-style and
// "%"-style spellings; "yyyy-MM", "yyyyMM" and "yyyy" are recognised in the
// "yyyy"-style spelling only. The comparison is exact (case- and whitespace-sensitive).
// Returns NoneImpl for anything else, meaning the general formatting logic must be used.
inline FormatImplVariant string_to_impl(const std::string& format) {
    if (format == "yyyyMMdd" || format == "%Y%m%d") {
        return yyyyMMddImpl {};
    }
    if (format == "yyyy-MM-dd" || format == "%Y-%m-%d") {
        return yyyy_MM_ddImpl {};
    }
    if (format == "yyyy-MM-dd HH:mm:ss" || format == "%Y-%m-%d %H:%i:%s") {
        return yyyy_MM_dd_HH_mm_ssImpl {};
    }
    if (format == "yyyy-MM") {
        return yyyy_MMImpl {};
    }
    if (format == "yyyyMM") {
        return yyyyMMImpl {};
    }
    if (format == "yyyy") {
        return yyyyImpl {};
    }
    // No specialised formatter exists for this format.
    return NoneImpl {};
}

} // namespace doris::vectorized::time_format_type
104 changes: 68 additions & 36 deletions be/src/vec/functions/date_time_transforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "vec/core/types.h"
#include "vec/data_types/data_type_date_time.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/date_format_type.h"
#include "vec/runtime/vdatetime_value.h"
#include "vec/utils/util.hpp"

Expand Down Expand Up @@ -184,34 +185,44 @@ struct DateFormatImpl {

static constexpr auto name = "date_format";

static inline auto execute(const FromType& t, StringRef format, ColumnString::Chars& res_data,
size_t& offset) {
const auto& dt = (DateType&)t;
if (format.size > 128) {
return std::pair {offset, true};
}
char buf[100 + SAFE_FORMAT_STRING_MARGIN];
if (!dt.to_format_string_conservative(format.data, format.size, buf,
100 + SAFE_FORMAT_STRING_MARGIN)) {
return std::pair {offset, true};
}
template <typename Impl>
static inline bool execute(const FromType& t, StringRef format, ColumnString::Chars& res_data,
size_t& offset, const cctz::time_zone& time_zone) {
if constexpr (std::is_same_v<Impl, time_format_type::NoneImpl>) {
// Handle non-special formats.
const auto& dt = (DateType&)t;
char buf[100 + SAFE_FORMAT_STRING_MARGIN];
if (!dt.to_format_string_conservative(format.data, format.size, buf,
100 + SAFE_FORMAT_STRING_MARGIN)) {
return true;
}

auto len = strlen(buf);
res_data.insert(buf, buf + len);
offset += len;
return false;
} else {
const auto& dt = (DateType&)t;

auto len = strlen(buf);
res_data.insert(buf, buf + len);
offset += len;
return std::pair {offset, false};
if (!dt.is_valid_date()) {
return true;
}

// No buffer is needed here because these specially optimized formats have fixed lengths,
// and sufficient memory has already been reserved.
auto len = Impl::date_to_str(dt, (char*)res_data.data() + offset);
offset += len;

return false;
}
}

static DataTypes get_variadic_argument_types() {
return std::vector<DataTypePtr> {
std::dynamic_pointer_cast<const IDataType>(
std::make_shared<typename DateTraits<ArgType>::DateType>()),
std::dynamic_pointer_cast<const IDataType>(
std::make_shared<vectorized::DataTypeString>())};
return std::vector<DataTypePtr> {std::make_shared<typename DateTraits<ArgType>::DateType>(),
std::make_shared<vectorized::DataTypeString>()};
}
};

// TODO: The nullability of this function's result should depend on its arguments instead of always being nullable.
template <typename DateType>
struct FromUnixTimeImpl {
using FromType = Int64;
Expand All @@ -220,24 +231,45 @@ struct FromUnixTimeImpl {
static const int64_t TIMESTAMP_VALID_MAX = 32536771199;
static constexpr auto name = "from_unixtime";

static inline auto execute(FromType val, StringRef format, ColumnString::Chars& res_data,
template <typename Impl>
static inline bool execute(const FromType& val, StringRef format, ColumnString::Chars& res_data,
size_t& offset, const cctz::time_zone& time_zone) {
DateType dt;
if (format.size > 128 || val < 0 || val > TIMESTAMP_VALID_MAX) {
return std::pair {offset, true};
}
dt.from_unixtime(val, time_zone);
if constexpr (std::is_same_v<Impl, time_format_type::NoneImpl>) {
DateType dt;
if (val < 0 || val > TIMESTAMP_VALID_MAX) {
return true;
}
dt.from_unixtime(val, time_zone);

char buf[100 + SAFE_FORMAT_STRING_MARGIN];
if (!dt.to_format_string_conservative(format.data, format.size, buf,
100 + SAFE_FORMAT_STRING_MARGIN)) {
return std::pair {offset, true};
}
char buf[100 + SAFE_FORMAT_STRING_MARGIN];
if (!dt.to_format_string_conservative(format.data, format.size, buf,
100 + SAFE_FORMAT_STRING_MARGIN)) {
return true;
}

auto len = strlen(buf);
res_data.insert(buf, buf + len);
offset += len;
return std::pair {offset, false};
auto len = strlen(buf);
res_data.insert(buf, buf + len);
offset += len;
return false;

} else {
DateType dt;
if (val < 0 || val > TIMESTAMP_VALID_MAX) {
return true;
}
dt.from_unixtime(val, time_zone);

if (!dt.is_valid_date()) {
return true;
}

// No buffer is needed here because these specially optimized formats have fixed lengths,
// and sufficient memory has already been reserved.
auto len = Impl::date_to_str(dt, (char*)res_data.data() + offset);
offset += len;

return false;
}
}
};

Expand Down
Loading

0 comments on commit 32d4b08

Please sign in to comment.