遵循my original question,并考虑到一些建议的解决方案,我为C++14提出了如下解决方案:
#include <algorithm>
#include <exception>
#include <iterator>
#include <cstddef>
template<class It, class Func>
auto binary_fold(It begin, It end, Func op) -> decltype(op(*begin, *end)) {
std::ptrdiff_t diff = end - begin;
switch (diff) {
case 0: throw std::out_of_range("binary fold on empty container");
case 1: return *begin;
case 2: return op(*begin, *(begin + 1));
default: { // first round to the nearest multiple of 2 and then advance
It mid{begin};
int div = diff/2;
int offset = (div%2 == 1) ? (div+1) : div; // round to the closest multiple of two (upwards)
std::advance(mid, offset);
return op( binary_fold(begin,mid,op), binary_fold(mid,end,op) );
}
}
}该算法将以递归的方式执行二进制运算,直到得到结果。例如。
std::vector<int> v = {1,3,5,6,1};
auto result = mar::binary_fold(v.cbegin(), v.cend(), std::minus<int>());将解决以下问题:
1 - (5-6) - (1-3) = 0在某些情况下(如上文所述),算法将是左关联,但在其他情况下(如以下所示),它将是右关联:
std::vector<int> v = {7,4,9,2,6,8};
auto result = mar::binary_fold(v.cbegin(), v.cend(), std::minus<int>());在以下方面的成果:
(7-4) - (9-2) - (6-8) = -2我想知道如何进一步优化这个算法,以便:
答:肯定是左或右联想
它尽可能快(这将放在一个openGL绘图循环中,所以它必须非常快)。
c.制作一个TMP版本,当容器的大小已知时,它将在编译时计算偏移量(这对于我的应用程序来说不是必要的,但我只是好奇它是如何实现的)。
我对b的第一个想法是,迭代版本可能会更快,偏移量计算可以进一步优化(也许使用一些位魔术?)。不过,我还是被困住了。
发布于 2016-02-16 22:40:55
我有两个TMP版本。哪一种更好,取决于数据类型,我猜:
解A:
首先,让我们为分裂点找到一个很好的抵消(两种力量的力量似乎很大):
template<std::ptrdiff_t diff, std::ptrdiff_t V = 2>
struct offset
{
static constexpr std::ptrdiff_t value =
(V * 2 < diff - 1) ? offset<diff, V * 2>::value : V;
};
// End recursion
template<std::ptrdiff_t diff>
struct offset<diff, 1<<16>
{
static constexpr std::ptrdiff_t value = 1<<16;
};
// Some special cases
template<>
struct offset<0, 2>
{
static constexpr std::ptrdiff_t value = 0;
};
template<>
struct offset<1, 2>
{
static constexpr std::ptrdiff_t value = 0;
};
template<>
struct offset<2, 2>
{
static constexpr std::ptrdiff_t value = 0;
};这样,我们可以创建一个递归的TMP版本:
template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_tmp(It begin, It end, Func op)
-> decltype(op(*begin, *end))
{
assert(end - begin == diff);
switch (diff)
{
case 0:
assert(false);
return 0; // This will never happen
case 1:
return *begin;
case 2:
return op(*begin, *(begin + 1));
default:
{ // first round to the nearest multiple of 2 and then advance
It mid{begin};
std::advance(mid, offset<diff>::value);
auto left = binary_fold_tmp<offset<diff>::value>(begin, mid, op);
auto right =
binary_fold_tmp<diff - offset<diff>::value>(mid, end, op);
return op(left, right);
}
}
}例如,这可以与非TMP版本相结合:
template <class It, class Func>
auto binary_fold(It begin, It end, Func op)
-> decltype(op(*begin, *end))
{
const auto diff = end - begin;
assert(diff > 0);
switch (diff)
{
case 1:
return binary_fold_tmp<1>(begin, end, op);
case 2:
return binary_fold_tmp<2>(begin, end, op);
case 3:
return binary_fold_tmp<3>(begin, end, op);
case 4:
return binary_fold_tmp<4>(begin, end, op);
case 5:
return binary_fold_tmp<5>(begin, end, op);
case 6:
return binary_fold_tmp<6>(begin, end, op);
case 7:
return binary_fold_tmp<7>(begin, end, op);
case 8:
return binary_fold_tmp<8>(begin, end, op);
default:
if (diff < 16)
return op(binary_fold_tmp<8>(begin, begin + 8, op),
binary_fold(begin + 8, end, op));
else if (diff < 32)
return op(binary_fold_tmp<16>(begin, begin + 16, op),
binary_fold(begin + 16, end, op));
else
return op(binary_fold_tmp<32>(begin, begin + 32, op),
binary_fold(begin + 32, end, op));
}
}解决方案B:
这将计算成对的结果,将它们存储在缓冲区中,然后使用缓冲区调用自己:
template <std::ptrdiff_t diff, class It, class Func, size_t... Is>
auto binary_fold_pairs_impl(It begin,
It end,
Func op,
const std::index_sequence<Is...>&)
-> decltype(op(*begin, *end))
{
std::decay_t<decltype(*begin)> pairs[diff / 2] = {
op(*(begin + 2 * Is), *(begin + 2 * Is + 1))...};
if (diff == 2)
return pairs[0];
else
return binary_fold_pairs_impl<diff / 2>(
&pairs[0],
&pairs[0] + diff / 2,
op,
std::make_index_sequence<diff / 4>{});
}
template <std::ptrdiff_t diff, class It, class Func>
auto binary_fold_pairs(It begin, It end, Func op) -> decltype(op(*begin, *end))
{
return binary_fold_pairs_impl<diff>(
begin, end, op, std::make_index_sequence<diff / 2>{});
}这个模板函数要求diff是2的幂。但是当然,您也可以将它与非模板版本结合在一起:
template <class It, class Func>
auto binary_fold_mix(It begin, It end, Func op) -> decltype(op(*begin, *end))
{
const auto diff = end - begin;
assert(diff > 0);
switch (diff)
{
case 1:
return *begin;
case 2:
return binary_fold_pairs<2>(begin, end, op);
case 3:
return op(binary_fold_pairs<2>(begin, begin + 1, op),
*(begin + (diff - 1)));
case 4:
return binary_fold_pairs<4>(begin, end, op);
case 5:
return op(binary_fold_pairs<4>(begin, begin + 4, op),
*(begin + (diff - 1)));
case 6:
return op(binary_fold_pairs<4>(begin, begin + 4, op),
binary_fold_pairs<4>(begin + 4, begin + 6, op));
case 7:
return op(binary_fold_pairs<4>(begin, begin + 4, op),
binary_fold_mix(begin + 4, begin + 7, op));
case 8:
return binary_fold_pairs<8>(begin, end, op);
default:
if (diff <= 16)
return op(binary_fold_pairs<8>(begin, begin + 8, op),
binary_fold_mix(begin + 8, end, op));
else if (diff <= 32)
return op(binary_fold_pairs<16>(begin, begin + 16, op),
binary_fold_mix(begin + 16, end, op));
else
return op(binary_fold_pairs<32>(begin, begin + 32, op),
binary_fold_mix(begin + 32, end, op));
}
}我用与MtRoad相同的程序进行测量。在我的机器上,差异没有MtRoad报告的那么大。使用-O3解决方案A和B似乎略快于MtRoad的版本,但实际上,您需要使用您的类型和数据进行测试。
备注:我没有太严格地测试我的版本。
发布于 2016-02-16 05:09:46
我写了一个“总是左关联”的迭代版本,你也可以使用一些定时运行。在打开编译器优化之前,它的性能会稍微差一些。
10000次迭代的总运行时间为5000次。
g++ --std=c++11 main.cpp && ./a.out
Recursive elapsed:9642msec
Iterative elapsed:10189msec
$ g++ --std=c++11 -O1 main.cpp && ./a.out
Recursive elapsed:3468msec
Iterative elapsed:3098msec
Iterative elapsed:3359msec # another run
Recursive elapsed:3668msec
$ g++ --std=c++11 -O2 main.cpp && ./a.out
Recursive elapsed:3193msec
Iterative elapsed:2763msec
Recursive elapsed:3184msec # another run
Iterative elapsed:2696msec
$ g++ --std=c++11 -O3 main.cpp && ./a.out
Recursive elapsed:3183msec
Iterative elapsed:2685msec
Recursive elapsed:3106msec # another run
Iterative elapsed:2681msec
Recursive elapsed:3054msec # another run
Iterative elapsed:2653msec编译器可以有比递归更容易优化循环的时间。
#include <algorithm>
#include <functional>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>
template<class It, class Func>
auto binary_fold_rec(It begin, It end, Func op) -> decltype(op(*begin, *end)) {
std::ptrdiff_t diff = end - begin;
switch (diff) {
case 0: throw std::out_of_range("binary fold on empty container");
case 1: return *begin;
case 2: return op(*begin, *(begin + 1));
default: { // first round to the nearest multiple of 2 and then advance
It mid{begin};
int div = diff/2;
int offset = (div%2 == 1) ? (div+1) : div; // round to the closest multiple of two (upwards)
std::advance(mid, offset);
return op( binary_fold_rec(begin,mid,op), binary_fold_rec(mid,end,op) );
}
}
}
// left-associative
template<class It, class Func>
auto binary_fold_it(It begin, It end, Func op) -> decltype(op(*begin, *end)) {
// Allocates enough scratch to begin with that we don't need to mess with again.
std::ptrdiff_t diff = end - begin;
std::vector<decltype(op(*begin, *end))> scratch (static_cast<int>(diff));
auto scratch_current = scratch.begin();
if(diff == 0) {
throw std::out_of_range("binary fold on empty container.");
}
while(diff > 1) {
auto fake_end = (diff & 1) ? end - 1 : end;
while(begin != fake_end) {
(*scratch_current++) = op(*begin, *(begin+1));
begin += 2; // silly c++ can't guarantee ++ order, so increment here.
}
if(fake_end != end) {
*scratch_current++ = *begin;
}
end = scratch_current;
begin = scratch_current = scratch.begin();
diff = end - begin;
};
return scratch[0];
}
void run(std::initializer_list<int> elems, int expected) {
std::vector<int> v(elems);
auto result = binary_fold_it(v.begin(), v.end(), std::minus<int>());
std::cout << result << std::endl;
assert(binary_fold_it(v.begin(), v.end(), std::minus<int>()) == expected);
}
constexpr int rolls = 10000;
constexpr int min_val = -1000;
constexpr int max_val = 1000;
constexpr int num_vals = 5000;
std::vector<int> random_vector() {
// Thanks http://stackoverflow.com/questions/21516575/fill-a-vector-with-random-numbers-c
// for saving me time.
std::uniform_int_distribution<int> distribution(min_val, max_val);
std::default_random_engine generator;
std::vector<int> data(num_vals);
std::generate(data.begin(), data.end(), [&]() { return distribution(generator); });
return data;
}
template<typename It, typename Func>
void evaluate(void(*func)(It, It, Func), const char* message) {
auto start = std::chrono::high_resolution_clock::now();
for(int i=0; i<rolls; i++) {
auto data = random_vector();
func(data.begin(), data.end(), std::minus<int>());
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << message << std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count() << "msec\n";
}
void evaluate(void(*func)(), const char* message) {
auto start = std::chrono::high_resolution_clock::now();
for(int i=0; i<rolls; i++) {
func();
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << message << std::chrono::duration_cast<std::chrono::milliseconds>(end-start).count() << "msec\n";
}
void time_it() {
auto data = random_vector();
binary_fold_it(data.begin(), data.end(), std::minus<int>());
}
void time_rec() {
auto data = random_vector();
binary_fold_rec(data.begin(), data.end(), std::minus<int>());
}
int main() {
evaluate(time_rec, "Recursive elapsed:");
evaluate(time_it, "Iterative elapsed:");
return 0;
}https://stackoverflow.com/questions/35318908
复制相似问题