问:是否有一种"Dial_A_Bit"基数排序,可以只对数据类型中的一部分位进行排序?
uint64 *Radix_Sort_Dial_A_Bit(uint64_t *a, int num_a, int sort_bits);
特别是,对64位数据只按38位排序是否可行,并且速度介于32/64位排序和48/64位排序之间,如下所示?
uint64 *Radix_Sort_ui64_38MSB(uint64_t *a, int num_a);
请注意,与对uint64_t[]中全部64位排序相比,只对48位和32位排序在速度和正确性上的改进都已经得到验证。
似乎对数据包大小的子集进行排序的Radix_Sort()通常是有用和有效的,只对所需的内容进行排序。
在某些情况下,需要对每个像素计算结果并对其进行排序。uint64_t[]用于保存计算结果和XY位置。
保存像素XY坐标共需26位(X和Y各13位,最大分辨率为8192),这样留给待排序数据的就是38位。
可以使用Radix_Sort_Uint64()对整个64位包进行排序。
一个更快的方法是使用Radix_Sort_Uint48() (见下文),因此排序中不考虑最后16位。这将正确地对所有数据进行排序,并对不需要的13 X坐标位中的10位进行排序。
由于性能几乎与参与排序的位数成线性比例,因此最理想的做法是在排序中只考虑最高的38位。
即使是40位基数排序也比使用48位要好。我试图将工作的48位基数排序概括为对40位进行操作,但它没有正确排序。
QSort_uint64_38_msb():
/*
 * qsort()-style comparator that orders two uint64_t values by their 38 most
 * significant bits only.  The 26 least significant bits are shifted out
 * before comparing, so values that differ only in those bits compare equal.
 *
 * a, b: pointers to the two uint64_t elements to compare.
 * Returns -1 if *a sorts before *b, 0 if they tie on the top 38 bits,
 * +1 otherwise.
 */
static inline int comp_uint64_38_msb(const void *a, const void *b) {
    /* Read through const-qualified pointers and stay unsigned: the data is
     * unsigned, and the comparator must not cast away qsort()'s const. */
    const uint64_t ca = *(const uint64_t *)a >> 26; /* drop the 26 LSBs */
    const uint64_t cb = *(const uint64_t *)b >> 26; /* drop the 26 LSBs */

    /* Branch-free three-way compare; cannot overflow like a subtraction. */
    return (ca > cb) - (ca < cb);
}
// As you can see, the 48-bit sort is much faster than the full 64-bit sort. The 32-bit sort is almost twice as fast as the full 64-bit sort. And qsort is far behind:
Time= 2.136 sec = 3.390%, RADIX_SORT_FFFF0000 , hits=4, 0.534 sec each
Time= 2.944 sec = 4.672%, RADIX_SORT_FFFFFF00 , hits=4, 0.736 sec each
Time= 4.691 sec = 7.444%, RADIX_SORT_64 , hits=5, 0.938 sec each
Time= 25.209 sec = 40.010%, QSORT_UINT64_ARRAY , hits=4, 6.302 sec each
Time= 26.191 sec = 41.569%, QSORT_UINT64_38_ARRAY, hits=4, 6.548 sec each
根据64、48和32位的结果线性拟合到38位:
lsf 64 0.938 48 0.736 32 0.534 38 -> 0.6500
38位Radix_Sort比64位排序快35%,比48位排序快17%。
即使是40位也会更快,每uint64处理5个字节和6个字节。
=========
最快的排序:只对uint64[]中8个字节里的6个字节进行排序,泛化自:32 MSbits used to sort uint64
// #############################################################################
// From: http://ideone.com/JHI0d9
// Radix sort on the 48 most significant bits of a uint64_t.
//
// Six 256-entry tables, one per sorted byte: c1 belongs to the most
// significant byte (bits 56-63) and c6 to the least significant sorted byte
// (bits 16-23).  `counts` overlays the same storage as one flat array.
typedef union {
    struct {
        uint32_t c6[256];
        uint32_t c5[256];
        uint32_t c4[256];
        uint32_t c3[256];
        uint32_t c2[256];
        uint32_t c1[256];
    };
    uint32_t counts[256 * 6];
} rscounts6_t;
// #############################################################################
// Patterned off of Radix_Sort_64 but looks only at the 48 most significant bits.
// 0xFFFF-FFFF-FFFF-0000 << the two low bytes are ignored, the six high bytes
// are sorted.
// Made for RGB48 stuffed into uint64 with the 2 least significant bytes zero.
//
// Sorts arrayA in place (stable, least significant sorted byte first) using a
// temporary heap buffer of asize elements.
//
//   arrayA - array to sort
//   asize  - number of elements in arrayA
//
// Returns arrayA on success.  Returns NULL, leaving arrayA untouched, when
// the temporary buffer cannot be allocated.
uint64_t *radix_sort_48_msb(uint64_t *arrayA, uint32_t asize)
{
    enum { PASSES = 6, FIRST_SHIFT = 16 };
    rscounts6_t counts;
    /* table[p] serves the byte at bit position FIRST_SHIFT + 8 * p. */
    uint32_t *const table[PASSES] = {
        counts.c6, counts.c5, counts.c4, counts.c3, counts.c2, counts.c1
    };
    uint64_t *src = arrayA;
    uint64_t *dst;
    uint64_t *cpy;
    uint32_t x;
    int pass;

    /* Zero or one element is already sorted; skip the allocation. */
    if (asize < 2)
        return arrayA;

    /* Guard the size computation, then check the allocation itself. */
    if (asize > SIZE_MAX / sizeof *cpy)
        return NULL;
    cpy = malloc(asize * sizeof *cpy);
    if (cpy == NULL)
        return NULL;

    /* Count how often each byte value occurs, for all six bytes at once. */
    memset(&counts, 0, sizeof counts);
    for (x = 0; x < asize; x++) {
        const uint64_t v = arrayA[x];
        for (pass = 0; pass < PASSES; pass++)
            table[pass][(v >> (FIRST_SHIFT + 8 * pass)) & 0xff]++;
    }

    /* Turn counts into starting offsets (exclusive prefix sums). */
    for (pass = 0; pass < PASSES; pass++) {
        uint32_t running = 0;
        for (x = 0; x < 256; x++) {
            const uint32_t in_bucket = table[pass][x];
            table[pass][x] = running;
            running += in_bucket;
        }
    }

    /* Six stable scatter passes, ping-ponging between the two buffers.
     * The pass count is even, so the sorted data ends up back in arrayA. */
    dst = cpy;
    for (pass = 0; pass < PASSES; pass++) {
        const int shift = FIRST_SHIFT + 8 * pass;
        uint32_t *next = table[pass];
        uint64_t *swap;

        for (x = 0; x < asize; x++) {
            const uint64_t v = src[x];
            dst[next[(v >> shift) & 0xff]++] = v;
        }
        swap = src;
        src = dst;
        dst = swap;
    }

    free(cpy);
    return arrayA;
} // End radix_sort_48_msb().
// ==================================
再次感谢Rcgldr的创新编程建议!我没有采用10、10、9、9的划分,而是沿用了快速32位版本的模式,使用4个10位字段。
它可以工作,但比48 MSB排序慢一些:40 MSB为737 msec,48 MSB为588 msec。:(
也许我的编码很差。
Time= 6.108 sec = 33.668%, QSORT_UINT64_ARRAY , hits=1
Time= 3.060 sec = 16.866%, RADIX_SORT_UINT64_REG, hits=4, 0.765 sec each
Time= 2.947 sec = 16.241%, RADIX_SORT_UINT64_40R, hits=4, 0.737 sec each < SLOW
Time= 2.354 sec = 12.973%, RADIX_SORT_UINT64_48R, hits=4, 0.588 sec each
Time= 1.542 sec = 8.498%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
Time= 0.769 sec = 4.236%, RADIX_SORT_64 , hits=1测试:
之后,将每个测试排序与标准进行比较
以下是代码:
//=============================================================================
// Adapted from code submitted by rcgldr, Feb 8 2020.
// Stable radix sort keyed on the 40 most significant bits of each uint64_t,
// processed as four 10-bit digits from least to most significant.  The low
// 24 bits never take part in the ordering.
//   pData - array to sort; holds the sorted result on return
//   pTemp - scratch array the passes write into (indices 0 .. count-1)
//   count - number of elements in pData
//   tsa   - when non-NULL, passed to time_event() before sorting starts
void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
EV_TIME_STR *tsa)
{
    enum { DIGIT_BITS = 10, PASSES = 4, LOW_SHIFT = 24,
           BUCKETS = 1 << DIGIT_BITS };
    const uint64_t digit_mask = BUCKETS - 1;
    size_t offsets[PASSES][BUCKETS] = { 0 }; /* one table per digit */
    uint64_t *src = pData;
    uint64_t *dst = pTemp;
    size_t pos, pass, bucket;

    if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);

    /* Histogram every digit in a single sweep over the input. */
    for (pos = 0; pos < count; pos++) {
        const uint64_t key = pData[pos];
        for (pass = 0; pass < PASSES; pass++)
            offsets[pass][(key >> (LOW_SHIFT + pass * DIGIT_BITS)) & digit_mask]++;
    }

    /* Replace each count with the index where its bucket begins. */
    for (pass = 0; pass < PASSES; pass++) {
        size_t running = 0;
        for (bucket = 0; bucket < BUCKETS; bucket++) {
            const size_t in_bucket = offsets[pass][bucket];
            offsets[pass][bucket] = running;
            running += in_bucket;
        }
    }

    /* Scatter passes alternate pData -> pTemp -> pData ...; with an even
     * number of passes the final output lands in pData. */
    for (pass = 0; pass < PASSES; pass++) {
        const unsigned shift = (unsigned)(LOW_SHIFT + pass * DIGIT_BITS);
        size_t *next = offsets[pass];
        uint64_t *swap;

        for (pos = 0; pos < count; pos++) {
            const uint64_t key = src[pos];
            dst[next[(key >> shift) & digit_mask]++] = key;
        }
        swap = src;
        src = dst;
        dst = swap;
    }
} // End Radix_Sort_R64_40().
// Here is the only diff between the FAST 32 MSB version and the cloned, slow 40 MSB version.
Unique lines from "~/tmp/radix.sort.32.c":
02) void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
05) size_t mIndex[4][256] = { 0 }; /* index matrix */
09) if(tsa) time_event(E_RADIX_SORT_UINT64_32R, tsa, E_TIME_EVENT, 1, 0);
13) mIndex[3][(u >> 32) & 0xff]++; // B4
14) mIndex[2][(u >> 40) & 0xff]++; // B5
15) mIndex[1][(u >> 48) & 0xff]++; // B6
16) mIndex[0][(u >> 56) & 0xff]++; // B7
22) for (i = 0; i < 256; i++) {
31) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
35) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
39) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
43) pData[mIndex[0][(u >> 56) & 0xff]++] = u;
Unique lines from "~/tmp/radix.sort.40.c":
01) void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[4][1024] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);
12) mIndex[3][(u >> 24) & 0x3ff]++;
13) mIndex[2][(u >> 34) & 0x3ff]++;
14) mIndex[1][(u >> 44) & 0x3ff]++;
15) mIndex[0][(u >> 54) & 0x3ff]++;
21) for (i = 0; i < 1024; i++) {
30) pTemp[mIndex[3][(u >> 24) & 0x3ff]++] = u;
34) pData[mIndex[2][(u >> 34) & 0x3ff]++] = u;
38) pTemp[mIndex[1][(u >> 44) & 0x3ff]++] = u;
42) pData[mIndex[0][(u >> 54) & 0x3ff]++] = u;发布于 2020-02-08 17:14:50
使用{10位、10位、9位、9位}的位字段(计数==偏移量),可以在4趟内完成38 MSB的基数排序。将代码改为使用以下大小:c1[1024]、c2[1024]、c3[512]、c4[512],并在初始化计数时使用移位和掩码{(...>>54)&0x3ff、(...>>44)&0x3ff、(...>>35)&0x1ff、(...>>26)&0x1ff},然后对代码的其余部分做类似的修改。
尝试使用此代码进行比较。要么更改代码,要么将计时器内容添加到此代码中:
/*
 * Stable radix sort keyed on the 38 most significant bits of each uint64_t.
 * The key is split into digits of {9, 9, 10, 10} bits (least significant
 * digit first, starting at bit 26); the low 26 bits never take part in the
 * ordering.
 *
 *   a     - array to sort; holds the sorted result on return
 *   b     - scratch array the passes write into (indices 0 .. count-1)
 *   count - number of elements in a
 *
 * The offset tables are deliberately kept 32-bit to stay small, so count
 * must not exceed UINT32_MAX.
 */
void RadixSort(uint64_t *a, uint64_t *b, size_t count)
{
    enum { PASSES = 4 };
    static const unsigned shift[PASSES] = { 26, 35, 44, 54 };
    static const uint32_t buckets[PASSES] = { 512, 512, 1024, 1024 };
    uint32_t mIndex[PASSES][1024] = { { 0 } };  /* index matrix */
    uint64_t *src = a;
    uint64_t *dst = b;
    size_t i;       /* size_t, not uint32_t: must be able to reach count */
    int p;

    /* Generate all four histograms in one sweep. */
    for (i = 0; i < count; i++) {
        const uint64_t u = a[i];
        for (p = 0; p < PASSES; p++)
            mIndex[p][(u >> shift[p]) & (buckets[p] - 1)]++;
    }

    /* Convert counts to starting indices (exclusive prefix sums). */
    for (p = 0; p < PASSES; p++) {
        uint32_t n = 0;
        uint32_t k;
        for (k = 0; k < buckets[p]; k++) {
            const uint32_t m = mIndex[p][k];
            mIndex[p][k] = n;
            n += m;
        }
    }

    /* Scatter passes a -> b -> a -> b -> a; the result ends up in a. */
    for (p = 0; p < PASSES; p++) {
        uint32_t *pmIndex = mIndex[p];
        const uint64_t mask = buckets[p] - 1;
        uint64_t *swap;

        for (i = 0; i < count; i++) {
            const uint64_t u = src[i];
            dst[pmIndex[(u >> shift[p]) & mask]++] = u;
        }
        swap = src;
        src = dst;
        dst = swap;
    }
}
// A faster approach is to first sort on the most significant 10 bits,
// creating 1024 bins, and then radix sort each of the 1024 bins on {10,9,9}
// bit fields, least significant first. This speeds up the sort because each
// of the 1024 bins fits in cache, reducing the overhead of all the
// random-access writes. Note - aIndex has size 1025 to keep track of the
// size of the last bin.
void RadixSort3(uint64_t *, uint64_t *, size_t);

/*
 * Sorts `a` on its 38 most significant bits in two stages: first split the
 * array into 1024 bins by the most significant 10 bits, then radix sort each
 * bin on the remaining 28 key bits with RadixSort3().
 *
 *   a     - array to sort; holds the sorted result on return
 *   b     - scratch array of at least count elements
 *   count - number of elements in a
 *
 * The offset tables are 32-bit, so count must not exceed UINT32_MAX.
 */
void RadixSort(uint64_t *a, uint64_t *b, size_t count)
{
    uint32_t aIndex[1025] = {0};            /* index array */
    uint32_t m, n;
    size_t i;       /* size_t, not uint32_t: must be able to reach count */

    for (i = 0; i < count; i++)             /* generate histogram */
        aIndex[a[i] >> 54]++;

    n = 0;                                  /* convert to indices */
    for (i = 0; i < 1025; i++) {
        m = aIndex[i];
        aIndex[i] = n;
        n += m;
    }

    for (i = 0; i < count; i++)             /* sort by ms 10 bits */
        b[aIndex[a[i] >> 54]++] = a[i];

    /* The scatter advanced every entry to the end of its bin; shift the
     * table up by one so aIndex[i] is again the start of bin i and
     * aIndex[i + 1] its end. */
    for (i = 1024; i; i--)
        aIndex[i] = aIndex[i - 1];
    aIndex[0] = 0;

    /* Radix sort the bins from b back into a.  Empty bins are skipped so
     * RadixSort3() does not clear its tables for nothing. */
    for (i = 0; i < 1024; i++) {
        const size_t len = aIndex[i + 1] - aIndex[i];
        if (len != 0)
            RadixSort3(&b[aIndex[i]], &a[aIndex[i]], len);
    }
}

/*
 * Stable radix sort on key bits 26..53, taken as digits of {9, 9, 10} bits
 * from least to most significant.
 *
 *   a     - input elements; also used as intermediate storage
 *   b     - receives the sorted result
 *   count - number of elements; must not exceed UINT32_MAX
 */
void RadixSort3(uint64_t *a, uint64_t *b, size_t count)
{
    enum { PASSES = 3 };
    static const unsigned shift[PASSES] = { 26, 35, 44 };
    static const uint32_t buckets[PASSES] = { 512, 512, 1024 };
    uint32_t mIndex[PASSES][1024] = { { 0 } };  /* index matrix */
    uint64_t *src = a;
    uint64_t *dst = b;
    size_t i;
    int p;

    /* Generate the three histograms in one sweep. */
    for (i = 0; i < count; i++) {
        const uint64_t u = a[i];
        for (p = 0; p < PASSES; p++)
            mIndex[p][(u >> shift[p]) & (buckets[p] - 1)]++;
    }

    /* Convert counts to starting indices (exclusive prefix sums). */
    for (p = 0; p < PASSES; p++) {
        uint32_t n = 0;
        uint32_t k;
        for (k = 0; k < buckets[p]; k++) {
            const uint32_t m = mIndex[p][k];
            mIndex[p][k] = n;
            n += m;
        }
    }

    /* Scatter passes a -> b -> a -> b; an odd pass count leaves the
     * result in b. */
    for (p = 0; p < PASSES; p++) {
        uint32_t *pmIndex = mIndex[p];
        const uint64_t mask = buckets[p] - 1;
        uint64_t *swap;

        for (i = 0; i < count; i++) {
            const uint64_t u = src[i];
            dst[pmIndex[(u >> shift[p]) & mask]++] = u;
        }
        swap = src;
        src = dst;
        dst = swap;
    }
}
// Posted 2020-02-08 19:46:31
40 MSB排序性能非常差,远远慢于48 MSB版本。
因此,我尝试了一个[6][64]的36 MSB版本(6趟,每趟6位),模仿相对较快的48 MSB版本。
我知道我想要的是38位,而不是36位。优化的关键在于如何把XY位置和该点上的属性打包到一个uint64_t中:使用8K的X和Y坐标,XY共占13+13位,而数据不超过64-26=38位。
当前数据的最大值为34或35位,因此36位应该可以工作。
下面是性能数据:
Time= 6.104 sec = 30.673%, QSORT_UINT64_ARRAY , hits=1
Time= 3.117 sec = 15.663%, RADIX_SORT_UINT64_REG, hits=4, 0.779 sec each
Time= 2.931 sec = 14.731%, RADIX_SORT_UINT64_40R, hits=4, 0.733 sec each
Time= 2.269 sec = 11.401%, RADIX_SORT_UINT64_48R, hits=4, 0.567 sec each
Time= 1.663 sec = 8.359%, RADIX_SORT_UINT64_36R, hits=4, 0.416 sec each < FAST
Time= 1.516 sec = 7.620%, RADIX_SORT_UINT64_32R, hits=4, 0.379 sec each
Time= 0.734 sec = 3.689%, RADIX_SORT_64 , hits=1它比48位代码快27%。
而且,如果36位变得太紧,它看起来应该可以扩展为[6][128],对42 MSB进行排序!
下面是完整的代码:
// Stable radix sort keyed on the 36 most significant bits of each uint64_t,
// processed as six 6-bit digits from least to most significant.  The low
// 28 bits never take part in the ordering.
//   pData - array to sort; holds the sorted result on return
//   pTemp - scratch array the passes write into (indices 0 .. count-1)
//   count - number of elements in pData
//   tsa   - when non-NULL, passed to time_event() before sorting starts
void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
EV_TIME_STR *tsa)
{
    // Digit boundaries by digit width:
    //   8 bits each: 64 -- 56 48 40 32 24 16
    //   6 bits each: 64 -- 58 52 46 40 34 28   (used here)
    enum { DIGIT_BITS = 6, PASSES = 6, LOW_SHIFT = 28,
           BUCKETS = 1 << DIGIT_BITS };
    const uint64_t digit_mask = BUCKETS - 1;
    size_t offsets[PASSES][BUCKETS] = { 0 }; /* one table per digit */
    uint64_t *src = pData;
    uint64_t *dst = pTemp;
    size_t pos, pass, bucket;

    if(tsa) time_event(E_RADIX_SORT_UINT64_36R, tsa, E_TIME_EVENT, 1, 0);

    /* Histogram every digit in a single sweep over the input. */
    for (pos = 0; pos < count; pos++) {
        const uint64_t key = pData[pos];
        for (pass = 0; pass < PASSES; pass++)
            offsets[pass][(key >> (LOW_SHIFT + pass * DIGIT_BITS)) & digit_mask]++;
    }

    /* Replace each count with the index where its bucket begins. */
    for (pass = 0; pass < PASSES; pass++) {
        size_t running = 0;
        for (bucket = 0; bucket < BUCKETS; bucket++) {
            const size_t in_bucket = offsets[pass][bucket];
            offsets[pass][bucket] = running;
            running += in_bucket;
        }
    }

    /* Scatter passes alternate pData -> pTemp -> pData ...; with an even
     * number of passes the final output lands in pData. */
    for (pass = 0; pass < PASSES; pass++) {
        const unsigned shift = (unsigned)(LOW_SHIFT + pass * DIGIT_BITS);
        size_t *next = offsets[pass];
        uint64_t *swap;

        for (pos = 0; pos < count; pos++) {
            const uint64_t key = src[pos];
            dst[next[(key >> shift) & digit_mask]++] = key;
        }
        swap = src;
        src = dst;
        dst = swap;
    }
} // End Radix_Sort_R64_36().
// The only diff against the 48 MSB function:
Unique lines from "/home/brianp/tmp/radix.sort.36.c":
01) void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][64] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_36R, tsa, E_TIME_EVENT, 1, 0);
11) mIndex[5][(u >> 28) & 0x3F]++; // N2
12) mIndex[4][(u >> 34) & 0x3F]++; // N3
13) mIndex[3][(u >> 40) & 0x3F]++; // N4
14) mIndex[2][(u >> 46) & 0x3F]++; // N5
15) mIndex[1][(u >> 52) & 0x3F]++; // N6
16) mIndex[0][(u >> 58) & 0x3F]++; // N7
22) for (i = 0; i < 64; i++) {
31) pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
35) pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
39) pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
43) pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
47) pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
51) pData[mIndex[0][(u >> 58) & 0x3F]++] = u;
Unique lines from "/home/brianp/tmp/radix.sort.48.c":
01) void radix_sort_r64_48(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][256] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_48R, tsa, E_TIME_EVENT, 1, 0);
14) mIndex[5][(u >> 16) & 0xff]++; // B2
15) mIndex[4][(u >> 24) & 0xff]++; // B3
16) mIndex[3][(u >> 32) & 0xff]++; // B4
17) mIndex[2][(u >> 40) & 0xff]++; // B5
18) mIndex[1][(u >> 48) & 0xff]++; // B6
19) mIndex[0][(u >> 56) & 0xff]++; // B7
25) for (i = 0; i < 256; i++) {
34) pTemp[mIndex[5][(u >> 16) & 0xff]++] = u;
38) pData[mIndex[4][(u >> 24) & 0xff]++] = u;
42) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
46) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
50) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
54) pData[mIndex[0][(u >> 56) & 0xff]++] = u;发布于 2020-02-09 18:06:56
为了完整起见,6 bin,7 bit/bin 42 MSB排序工作,性能与48 MSB和36 MSB版本一致。
Time= 6.334 sec = 25.435%, QSORT_UINT64_ARRAY , hits=1
Time= 3.519 sec = 14.131%, RADIX_SORT_UINT64_REG, hits=4, 0.880 sec each
Time= 3.273 sec = 13.145%, RADIX_SORT_UINT64_40R, hits=4, 0.818 sec each < anomaly
Time= 2.680 sec = 10.764%, RADIX_SORT_UINT64_48R, hits=4, 0.670 sec each
Time= 2.302 sec = 9.246%, RADIX_SORT_UINT64_42R, hits=4, 0.576 sec each < NEW
Time= 2.025 sec = 8.132%, RADIX_SORT_UINT64_36R, hits=4, 0.506 sec each
Time= 1.767 sec = 7.094%, RADIX_SORT_UINT64_32R, hits=4, 0.442 sec each
Time= 0.955 sec = 3.835%, RADIX_SORT_64 , hits=1有人能解释为什么40 MSB比48 MSB慢得多吗?
完整代码:
// Stable radix sort keyed on the 42 most significant bits of each uint64_t,
// processed as six 7-bit digits from least to most significant.  The low
// 22 bits never take part in the ordering.
//   pData - array to sort; holds the sorted result on return
//   pTemp - scratch array the passes write into (indices 0 .. count-1)
//   count - number of elements in pData
//   tsa   - when non-NULL, passed to time_event() before sorting starts
void radix_sort_r64_42(uint64_t *pData, uint64_t *pTemp, size_t count,
EV_TIME_STR *tsa)
{
    // Digit boundaries by digit width:
    //   8 bits each: 64 -- 56 48 40 32 24 16
    //   7 bits each: 64 -- 57 50 43 36 29 22   (used here)
    //   6 bits each: 64 -- 58 52 46 40 34 28
    enum { DIGIT_BITS = 7, PASSES = 6, LOW_SHIFT = 22,
           BUCKETS = 1 << DIGIT_BITS };
    const uint64_t digit_mask = BUCKETS - 1;
    size_t offsets[PASSES][BUCKETS] = { 0 }; /* one table per digit */
    uint64_t *src = pData;
    uint64_t *dst = pTemp;
    size_t pos, pass, bucket;

    if(tsa) time_event(E_RADIX_SORT_UINT64_42R, tsa, E_TIME_EVENT, 1, 0);

    /* Histogram every digit in a single sweep over the input. */
    for (pos = 0; pos < count; pos++) {
        const uint64_t key = pData[pos];
        for (pass = 0; pass < PASSES; pass++)
            offsets[pass][(key >> (LOW_SHIFT + pass * DIGIT_BITS)) & digit_mask]++;
    }

    /* Replace each count with the index where its bucket begins. */
    for (pass = 0; pass < PASSES; pass++) {
        size_t running = 0;
        for (bucket = 0; bucket < BUCKETS; bucket++) {
            const size_t in_bucket = offsets[pass][bucket];
            offsets[pass][bucket] = running;
            running += in_bucket;
        }
    }

    /* Scatter passes alternate pData -> pTemp -> pData ...; with an even
     * number of passes the final output lands in pData. */
    for (pass = 0; pass < PASSES; pass++) {
        const unsigned shift = (unsigned)(LOW_SHIFT + pass * DIGIT_BITS);
        size_t *next = offsets[pass];
        uint64_t *swap;

        for (pos = 0; pos < count; pos++) {
            const uint64_t key = src[pos];
            dst[next[(key >> shift) & digit_mask]++] = key;
        }
        swap = src;
        src = dst;
        dst = swap;
    }
} // End Radix_Sort_R64_42().
// Added: diff of the 36 MSB version against the 42 MSB version
Unique lines from "~/tmp/radix.sort.36.c":
01) void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][64] = { 0 }; /* index matrix */
11) mIndex[5][(u >> 28) & 0x3F]++; // N2
12) mIndex[4][(u >> 34) & 0x3F]++; // N3
13) mIndex[3][(u >> 40) & 0x3F]++; // N4
14) mIndex[2][(u >> 46) & 0x3F]++; // N5
15) mIndex[1][(u >> 52) & 0x3F]++; // N6
16) mIndex[0][(u >> 58) & 0x3F]++; // N7
22) for (i = 0; i < 64; i++) {
31) pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
35) pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
39) pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
43) pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
47) pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
51) pData[mIndex[0][(u >> 58) & 0x3F]++] = u;
19 Unique lines from "~/tmp/radix.sort.42.c":
01) void radix_sort_r64_42(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][128] = { 0 }; /* index matrix */
10) // 64 -- 56 48 40 32 24 16 -- 8 bits each
11) // 64 -- 57 50 43 36 29 22 -- 7 bits each
12) // 64 -- 58 52 46 40 34 28 -- 6 bits each
15) mIndex[5][(u >> 22) & 0x7F]++; // N2
16) mIndex[4][(u >> 29) & 0x7F]++; // N3
17) mIndex[3][(u >> 36) & 0x7F]++; // N4
18) mIndex[2][(u >> 43) & 0x7F]++; // N5
19) mIndex[1][(u >> 50) & 0x7F]++; // N6
20) mIndex[0][(u >> 57) & 0x7F]++; // N7
26) for (i = 0; i < 128; i++) {
35) pTemp[mIndex[5][(u >> 22) & 0x7F]++] = u;
39) pData[mIndex[4][(u >> 29) & 0x7F]++] = u;
43) pTemp[mIndex[3][(u >> 36) & 0x7F]++] = u;
47) pData[mIndex[2][(u >> 43) & 0x7F]++] = u;
51) pTemp[mIndex[1][(u >> 50) & 0x7F]++] = u;
55) pData[mIndex[0][(u >> 57) & 0x3F]++] = u;
https://stackoverflow.com/questions/59954204
复制相似问题