我想优化以下代码:
/*
 * Portable spelling of the alignment request used for the lookup table:
 * MSVC understands __declspec(align(n)), GCC and Clang understand
 * __attribute__((aligned(n))). Any other compiler leaves the table at its
 * natural alignment, which affects only performance, not correctness.
 */
#if defined(_MSC_VER)
#define STRING_TBL_ALIGN(n) __declspec(align(n))
#elif defined(__GNUC__)
#define STRING_TBL_ALIGN(n) __attribute__((aligned(n)))
#else
#define STRING_TBL_ALIGN(n)
#endif

/*
 * STRING_PREFETCH_TBL(ptr)
 *
 * With USE_SSE defined, issues four _mm_prefetch requests (hint T0) for
 * ptr, ptr+64, ptr+128 and ptr+192 bytes, i.e. the 256 bytes occupied by
 * TblToLower. Without USE_SSE it expands to a no-op.
 *
 * The expansion is a single statement (do { } while (0)), so the macro is
 * safe inside an unbraced if/else, and ptr is evaluated exactly once.
 * USE_SSE builds must include the header declaring _mm_prefetch
 * (<xmmintrin.h>) before the first use of this macro.
 */
#ifdef USE_SSE
#define STRING_PREFETCH_TBL(ptr) \
    do { \
        const char *string_prefetch_base_ = (const char *)(ptr); \
        _mm_prefetch(string_prefetch_base_, _MM_HINT_T0); \
        _mm_prefetch(string_prefetch_base_ + 64, _MM_HINT_T0); \
        _mm_prefetch(string_prefetch_base_ + 128, _MM_HINT_T0); \
        _mm_prefetch(string_prefetch_base_ + 192, _MM_HINT_T0); \
    } while (0)
#else
#define STRING_PREFETCH_TBL(ptr) ((void)0)
#endif

/*
 * Byte-to-lowercase lookup table with 256 entries, indexed by the unsigned
 * value of a byte. Entries 65..90 ('A'..'Z') hold 97..122 ('a'..'z');
 * every other entry holds its own index.
 *
 * The element type stays plain char so existing declarations of the table
 * remain valid. Where char is signed, entries 128..255 are stored as the
 * converted (negative) char values, which keeps the original byte pattern
 * on two's-complement targets.
 */
STRING_TBL_ALIGN(128) const char TblToLower[] =
{
    /*   0 */   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
    /*  16 */  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
    /*  32 */  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
    /*  48 */  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
    /*  64 */  64,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    /*  80 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,  91,  92,  93,  94,  95,
    /*  96 */  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    /* 112 */ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    /* 128 */ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    /* 144 */ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    /* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    /* 176 */ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    /* 192 */ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    /* 208 */ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    /* 224 */ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    /* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
};

/*
 * Copy the NUL-terminated string src into dst, replacing 'A'..'Z' with
 * 'a'..'z' via TblToLower; all other bytes are copied unchanged.
 *
 * src  NUL-terminated input string; must not be NULL.
 * dst  output buffer with room for strlen(src) + 1 bytes; must not be
 *      NULL. It may be the same pointer as src (each byte is read before
 *      the byte at the same offset is written).
 */
void StrToLowerCase( const char* src, char* dst )
{
    STRING_PREFETCH_TBL(TblToLower);

    while (*src != '\0')
    {
        /*
         * Index with the byte's unsigned value. Plain char may be signed,
         * in which case bytes >= 0x80 would otherwise produce a negative
         * index and read outside the table.
         */
        *dst++ = TblToLower[(unsigned char)*src++];
    }
    *dst = '\0';
}
这是 Visual Studio 2012 在未启用 SSE、使用 /O2 优化时为我生成的代码:
_StrToLowerCase:
00000000: 8B 54 24 04 mov edx,dword ptr [esp+4]
00000004: 8B 44 24 08 mov eax,dword ptr [esp+8]
00000008: 8A 0A mov cl,byte ptr [edx]
0000000A: 84 C9 test cl,cl
0000000C: 74 19 je 00000027
0000000E: 2B D0 sub edx,eax
00000010: 0F BE C9 movsx ecx,cl
00000013: 8D 40 01 lea eax,[eax+1]
00000016: 0F B6 89 00 00 00 movzx ecx,byte ptr _TblToLower[ecx]
00
0000001D: 88 48 FF mov byte ptr [eax-1],cl
00000020: 8A 0C 02 mov cl,byte ptr [edx+eax]
00000023: 84 C9 test cl,cl
00000025: 75 E9 jne 00000010
00000027: C6 00 00 mov byte ptr [eax],0
0000002A: C3 ret
……而这是启用 SSE 时生成的代码:
_StrToLowerCase:
00000000: 8B 54 24 04 mov edx,dword ptr [esp+4]
00000004: B8 00 00 00 00 mov eax,offset _TblToLower
00000009: 8A 0A mov cl,byte ptr [edx]
0000000B: 0F 18 08 prefetcht0 [eax]
0000000E: B8 40 00 00 00 mov eax,offset _TblToLower+40h
00000013: 0F 18 08 prefetcht0 [eax]
00000016: B8 80 00 00 00 mov eax,offset _TblToLower+80h
0000001B: 0F 18 08 prefetcht0 [eax]
0000001E: B8 C0 00 00 00 mov eax,offset _TblToLower+0C0h
00000023: 0F 18 08 prefetcht0 [eax]
00000026: 8B 44 24 08 mov eax,dword ptr [esp+8]
0000002A: 84 C9 test cl,cl
0000002C: 74 19 je 00000047
0000002E: 2B D0 sub edx,eax
00000030: 0F BE C9 movsx ecx,cl
00000033: 8D 40 01 lea eax,[eax+1]
00000036: 0F B6 89 00 00 00 movzx ecx,byte ptr _TblToLower[ecx]
00
0000003D: 88 48 FF mov byte ptr [eax-1],cl
00000040: 8A 0C 02 mov cl,byte ptr [edx+eax]
00000043: 84 C9 test cl,cl
00000045: 75 E9 jne 00000030
00000047: C6 00 00 mov byte ptr [eax],0
0000004A: C3 ret
在我看来,这已经相当理想了。所以我试着:
最后补充一点:我可能会把函数的主循环写成
while ( *src ) *dst++ = TblToLower[*src++];
……并将 TblToLower 表声明为函数内部的 static,以改善(确切地说:缩小)它的可见范围。也许这样还能提高缓存局部性?
https://codereview.stackexchange.com/questions/31036
复制相似问题