文章/答案/技术大牛

发布

社区首页 >问答首页 >将8位扩展为0或1的8布尔字节的英特尔x86组装优化技术

问将8位扩展为0或1的8布尔字节的英特尔x86组装优化技术
EN

Stack Overflow用户

提问于 2009-09-12 11:28:46

回答 13查看 5K关注 0票数 24

我学习汇编程序很长一段时间，我试图重写一些简单的过程\函数，以看到性能的好处(如果有的话)。我的主要开发工具是Delphi 2007，第一个示例将使用该语言，但它们也可以轻松地被翻译成其他语言。

问题是：

我们给出了一个无符号字节值，其中八位中的每一位都表示屏幕一行中的一个像素。每个像素可以是实心(1)或透明(0)。换句话说，我们有8个像素被打包成一个字节值。我想把这些像素解压到一个8字节的数组中，就像最年轻的像素(位)在数组的最低索引下降落一样。下面是一个示例：

One byte value -----------> eight byte array

10011011 -----------------> [1][1][0][1][1][0][0][1]

Array index number ------->  0  1  2  3  4  5  6  7

下面我介绍了解决这个问题的五个方法。接下来，我将展示他们的时间比较，以及我如何测量那些时间。

我的问题包括两部分：

我请求您提供详细的关于方法、DecodePixels4a和DecodePixels4b的答案。为什么方法4b比4a慢一些

例如，如果由于我的代码对齐不正确，所以速度更慢，那么向我展示在给定方法中哪些指令可以更好地对齐，以及如何这样做以避免破坏该方法。

我希望看到这个理论背后的实际例子。请记住，我正在学习汇编语言，我想从你的答案中获得知识，这使我今后能够编写更好的优化代码。

你能写比DecodePixels4a更快的例程吗？如果是的话，请介绍它，并描述您所采取的优化步骤。所谓更快的例程，我指的是在您的测试环境中运行时间最短的例程，在这里介绍的所有例程中。

所有英特尔家族处理器都是允许的，以及那些与它们兼容的处理器。

下面是我写的例行公事：

procedure DecodePixels1(EncPixels: Byte; var DecPixels: TDecodedPixels);
var
  i3: Integer;
begin
  DecPixels[0] := EncPixels and $01;
  for i3 := 1 to 7 do
  begin
    EncPixels := EncPixels shr 1;
    DecPixels[i3] := EncPixels and $01;
    //DecPixels[i3] := (EncPixels shr i3) and $01;  //this is even slower if you replace above 2 lines with it
  end;
end;


//Lets unroll the loop and see if it will be faster.
procedure DecodePixels2(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  DecPixels[0] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[1] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[2] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[3] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[4] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[5] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[6] := EncPixels and $01;
  EncPixels := EncPixels shr 1;
  DecPixels[7] := EncPixels and $01;
end;


procedure DecodePixels3(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  asm
    push eax;
    push ebx;
    push ecx;
    mov bl, al;
    and bl, $01;
    mov [edx], bl;
    mov ecx, $00;
@@Decode:
    inc ecx;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + ecx], bl;
    cmp ecx, $07;
    jnz @@Decode;
    pop ecx;
    pop ebx;
    pop eax;
  end;
end;


//Unrolled assembly loop
procedure DecodePixels4a(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  asm
    push eax;
    push ebx;
    mov bl, al;
    and bl, $01;
    mov  [edx], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $01], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $02], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $03], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $04], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $05], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $06], bl;
    shr al, $01;
    mov bl, al;
    and bl, $01;
    mov [edx + $07], bl;
    pop ebx;
    pop eax;
  end;
end;


// it differs compared to 4a only in switching two instructions (but seven times)
procedure DecodePixels4b(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  asm
    push eax;
    push ebx;
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx], bl;        //
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx + $01], bl;  //
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx + $02], bl;  //
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx + $03], bl;  //
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx + $04], bl;  //
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx + $05], bl;  //
    mov bl, al;
    and bl, $01;
    shr al, $01;          //
    mov [edx + $06], bl;  //
    mov bl, al;
    and bl, $01;
    mov [edx + $07], bl;
    pop ebx;
    pop eax;
  end;
end;

下面是我如何测试它们：

program Test;

{$APPTYPE CONSOLE}

uses
  SysUtils, Windows;

type
  TDecodedPixels = array[0..7] of Byte;
var
  Pixels: TDecodedPixels;
  Freq, TimeStart, TimeEnd :Int64;
  Time1, Time2, Time3, Time4a, Time4b: Extended;
  i, i2: Integer;

begin
  if QueryPerformanceFrequency(Freq) then
  begin
    for i2 := 1 to 100 do
    begin
      QueryPerformanceCounter(TimeStart);
      for i := 1 to 100000 do
        DecodePixels1(155, Pixels);
      QueryPerformanceCounter(TimeEnd);
      Time1 := Time1 + ((TimeEnd - TimeStart) / Freq * 1000);

      QueryPerformanceCounter(TimeStart);
      for i := 1 to 100000 do
        DecodePixels2(155, Pixels);
      QueryPerformanceCounter(TimeEnd);
      Time2 := Time2 + ((TimeEnd - TimeStart) / Freq * 1000);

      QueryPerformanceCounter(TimeStart);
      for i := 1 to 100000 do
        DecodePixels3(155, Pixels);
      QueryPerformanceCounter(TimeEnd);
      Time3 := Time3 + ((TimeEnd - TimeStart) / Freq * 1000);

      QueryPerformanceCounter(TimeStart);
      for i := 1 to 100000 do
        DecodePixels4a(155, Pixels);
      QueryPerformanceCounter(TimeEnd);
      Time4a := Time4a + ((TimeEnd - TimeStart) / Freq * 1000);

      QueryPerformanceCounter(TimeStart);
      for i := 1 to 100000 do
        DecodePixels4b(155, Pixels);
      QueryPerformanceCounter(TimeEnd);
      Time4b := Time4b + ((TimeEnd - TimeStart) / Freq * 1000);

    end;
    Writeln('Time1 : ' + FloatToStr(Time1 / 100) + ' ms.    <- Delphi loop.');
    Writeln('Time2 : ' + FloatToStr(Time2 / 100) + ' ms.    <- Delphi unrolled loop.');
    Writeln('Time3 : ' + FloatToStr(Time3/ 100) + ' ms.    <- BASM loop.');
    Writeln('Time4a : ' + FloatToStr(Time4a / 100) + ' ms.    <- BASM unrolled loop.');
    Writeln('Time4b : ' + FloatToStr(Time4b / 100) + ' ms.    <- BASM unrolled loop instruction switch.');
  end;
  Readln;
end.

下面是我的机器(Intel Pentium E2180 on Win32 XP)的结果：

Time1  : 1,68443549919493 ms.     <- Delphi loop.
Time2  : 1,33773024572211 ms.     <- Delphi unrolled loop.
Time3  : 1,37015271374424 ms.     <- BASM loop.
Time4a : 0,822916962526627 ms.    <- BASM unrolled loop.
Time4b : 0,862914462301607 ms.    <- BASM unrolled loop instruction switch.

结果是相当稳定的-在我做的每一次测试中，每次都会有很小的变化。这一直都是真的：Time1 > Time3 > Time 2 > Time4b > Time4a

因此，我认为Time4a和Time4b之间的差异取决于方法DecodePixels4b中的指令切换。有时是4%，有时高达10%，但4b总是比4a慢。

我正在考虑另一种方法，使用MMX指令一次写入内存8个字节，但我想不出将字节解压缩到64位寄存器的快速方法。

谢谢您抽时间见我。

谢谢你们的宝贵意见。虽然我可以同时回答大家，但不幸的是，与现代CPU相比，我只有一个“管道”，当时只能执行一条指令“回复”;-)因此，我将尝试在这里总结一些事情，并在你的回答下写上更多的评论。

首先，我想说的是，在发布我的问题之前，我想出了Wouter提出的解决方案，它实际上比我的汇编代码要慢得多。所以我决定不在这里发布这个例程，但是您可能会看到，我在循环Delphi版本的例程中也采用了同样的方法。这里有评论，因为它给了我更糟糕的结果。

这对我来说是个谜。我再次使用Wouter和PhilS的例程运行我的代码，结果如下：

Time1  : 1,66535493194387 ms.     <- Delphi loop.
Time2  : 1,29115785420688 ms.     <- Delphi unrolled loop.
Time3  : 1,33716934524107 ms.     <- BASM loop.
Time4a : 0,795041753757838 ms.    <- BASM unrolled loop.
Time4b : 0,843520166815013 ms.    <- BASM unrolled loop instruction switch.
Time5  : 1,49457681191307 ms.     <- Wouter van Nifterick, Delphi unrolled
Time6  : 0,400587402866258 ms.    <- PhiS, table lookup Delphi
Time7  : 0,325472442519827 ms.    <- PhiS, table lookup Delphi inline
Time8  : 0,37350491544239 ms.     <- PhiS, table lookup BASM

看看Time5的结果，是不是很奇怪？我想我有不同的Delphi版本，因为我生成的程序集代码不同于Wouter提供的程序集代码。

第二大编辑：

我知道为什么常规的5在我的机器上慢一些。我在编译器选项中检查了“范围检查”和“溢出检查”。我将assembler指令添加到常规9中，以查看它是否有用。使用此指令，组装过程似乎与Delphi内联变体一样好，甚至稍微好一些。

以下是最后的结果：

Time1  : 1,22508325749317 ms.     <- Delphi loop.
Time2  : 1,33004145373084 ms.     <- Delphi unrolled loop.
Time3  : 1,1473583622526 ms.      <- BASM loop.
Time4a : 0,77322594033463 ms.     <- BASM unrolled loop.
Time4b : 0,846033593023372 ms.    <- BASM unrolled loop instruction switch.
Time5  : 0,688689382044384 ms.    <- Wouter van Nifterick, Delphi unrolled
Time6  : 0,503233741036693 ms.    <- PhiS, table lookup Delphi
Time7  : 0,385254722925063 ms.    <- PhiS, table lookup Delphi inline
Time8  : 0,432993919452751 ms.    <- PhiS, table lookup BASM
Time9  : 0,362680491244212 ms.    <- PhiS, table lookup BASM with assembler directive

第三大编辑：

@Pascal和@j_random_hacker例程4a、4b和5之间执行时间的差异是由数据依赖引起的。然而，基于我所做的进一步测试，我不得不不同意这个观点。

我还发明了基于4c的新例程4a。下面是：

procedure DecodePixels4c(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  asm
    push ebx;
    mov bl, al;
    and bl, 1;
    mov [edx], bl;
    mov bl, al;
    shr bl, 1;
    and bl, 1;
    mov [edx + $01], bl;
    mov bl, al;
    shr bl, 2;
    and bl, 1;
    mov [edx + $02], bl;
    mov bl, al;
    shr bl, 3;
    and bl, 1;
    mov [edx + $03], bl;
    mov bl, al;
    shr bl, 4;
    and bl, 1;
    mov [edx + $04], bl;
    mov bl, al;
    shr bl, 5;
    and bl, 1;
    mov [edx + $05], bl;
    mov bl, al;
    shr bl, 6;
    and bl, 1;
    mov [edx + $06], bl;
    shr al, 7;
    and al, 1;
    mov [edx + $07], al;
    pop ebx;
  end;
end;

我想说它是相当依赖于数据的。

这是测试和结果。我做了四次测试以确保没有意外。我还为GJ (Time10a，Time10b)提出的例程增加了新的时间。

          Test1  Test2  Test3  Test4

Time1   : 1,211  1,210  1,220  1,213
Time2   : 1,280  1,258  1,253  1,332
Time3   : 1,129  1,138  1,130  1,160

Time4a  : 0,690  0,682  0,617  0,635
Time4b  : 0,707  0,698  0,706  0,659
Time4c  : 0,679  0,685  0,626  0,625
Time5   : 0,715  0,682  0,686  0,679

Time6   : 0,490  0,485  0,522  0,514
Time7   : 0,323  0,333  0,336  0,318
Time8   : 0,407  0,403  0,373  0,354
Time9   : 0,352  0,378  0,355  0,355
Time10a : 1,823  1,812  1,807  1,813
Time10b : 1,113  1,120  1,115  1,118
Time10c : 0,652  0,630  0,653  0,633
Time10d : 0,156  0,155  0,172  0,160  <-- current winner!

如您所见，4a、4b、4c和5的结果非常接近。为什么会这样呢？因为我已经将从4a中删除了，所以4b (4c已经没有)有两个指令：push eax和pop eax。因为我知道我不会在代码中的任何地方使用eax下的值，所以我不必预先保存它。现在，我的代码只有一对push/pop，所以作为例程5。例程5预先保存了eax的值，因为它首先在ecx下复制它，但它没有预先保存ecx。

因此，我的结论是:5、4a和4b (在第三次编辑之前)的执行时间上的差异并不涉及数据依赖性，而是由附加的push / pop指令造成的。

我对你的评论很感兴趣。

几天后，GJ发明了比PhiS更快的例行公事(时间10d)。干得好，GJ！

x86

basm

delphi

optimization

assembly

回答 13

Stack Overflow用户

回答已采纳

发布于 2009-09-14 19:04:10

您的asm代码比较慢，因为使用堆栈结束将8次写入内存。看看这个..。

procedure DecodePixels(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
  xor   ecx, ecx
  add   al, al
  rcl   ecx, 8
  add   al, al
  rcl   ecx, 8
  add   al, al
  rcl   ecx, 8
  add   al, al
  rcl   ecx, 1
  mov   [DecPixels + 4], ecx
  xor   ecx, ecx
  add   al, al
  rcl   ecx, 8
  add   al, al
  rcl   ecx, 8
  add   al, al
  rcl   ecx, 8
  add   al, al
  rcl   ecx, 1
  mov   [DecPixels], ecx
end;

也许比使用查找表的代码还要快！

改进版本：

procedure DecodePixelsI(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
  mov   ecx, 0    //Faster than: xor   ecx, ecx
  add   al, al
  rcl   ch, 1
  add   al, al
  rcl   cl, 1
  ror   ecx, 16
  add   al, al
  rcl   ch, 1
  add   al, al
  rcl   cl, 1
  mov   [DecPixels + 4], ecx
  mov   ecx, 0    //Faster than: xor   ecx, ecx
  add   al, al
  rcl   ch, 1
  add   al, al
  rcl   cl, 1
  ror   ecx, 16
  add   al, al
  rcl   ch, 1
  add   al, al
  rcl   cl, 1
  mov   [DecPixels], ecx
end;

第3版：

procedure DecodePixelsX(EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
  add   al, al
  setc  byte ptr[DecPixels + 7]
  add   al, al
  setc  byte ptr[DecPixels + 6]
  add   al, al
  setc  byte ptr[DecPixels + 5]
  add   al, al
  setc  byte ptr[DecPixels + 4]
  add   al, al
  setc  byte ptr[DecPixels + 3]
  add   al, al
  setc  byte ptr[DecPixels + 2]
  add   al, al
  setc  byte ptr[DecPixels + 1]
  setnz byte ptr[DecPixels]
end;

第4版：

const Uint32DecPix : array [0..15] of cardinal = (
  $00000000, $00000001, $00000100, $00000101,
  $00010000, $00010001, $00010100, $00010101,
  $01000000, $01000001, $01000100, $01000101,
  $01010000, $01010001, $01010100, $01010101
  );

procedure DecodePixelsY(EncPixels: byte; var DecPixels: TDecodedPixels); inline;
begin
  pcardinal(@DecPixels)^ := Uint32DecPix[EncPixels and $0F];
  pcardinal(cardinal(@DecPixels) + 4)^ := Uint32DecPix[(EncPixels and $F0) shr 4];
end;

票数 6

Stack Overflow用户

发布于 2009-09-12 12:34:58

一般来说，我个人不会试图通过在汇编程序级使用技巧来优化代码，除非()您确实需要额外的2%或3%的速度，并且您愿意为代码的阅读、维护和移植付出代价。

为了压缩最后1%，您甚至可能需要维护每个处理器优化的几个版本，如果出现了更新的处理器和改进的pascal编译器，您将无法从中受益。

这个Delphi代码比最快的汇编程序代码更快的：

procedure DecodePixels5(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  DecPixels[0] := (EncPixels shr 0) and $01;
  DecPixels[1] := (EncPixels shr 1) and $01;
  DecPixels[2] := (EncPixels shr 2) and $01;
  DecPixels[3] := (EncPixels shr 3) and $01;
  DecPixels[4] := (EncPixels shr 4) and $01;
  DecPixels[5] := (EncPixels shr 5) and $01;
  DecPixels[6] := (EncPixels shr 6) and $01;
  DecPixels[7] := (EncPixels shr 7) and $01;
end;


Results:

Time1  : 1,03096806151283 ms.    <- Delphi loop.
Time2  : 0,740308641141395 ms.   <- Delphi unrolled loop.
Time3  : 0,996602425688886 ms.   <- BASM loop.
Time4a : 0,608267951561275 ms.   <- BASM unrolled loop.
Time4b : 0,574162510648039 ms.   <- BASM unrolled loop instruction switch.
Time5  : 0,499628206138524 ms. !!!  <- Delphi unrolled loop 5.

它的速度很快，因为这些操作只能通过寄存器来完成，而不需要存储和获取内存。现代处理器部分并行地执行这个操作(新的操作可以在前面完成之前启动)，因为连续指令的结果是相互独立的。

机器代码如下所示：

  push ebx;
  // DecPixels[0] := (EncPixels shr 0) and 1;
  movzx ecx,al
  mov ebx,ecx
  //  shr ebx,$00
  and bl,$01
  mov [edx],bl
  // DecPixels[1] := (EncPixels shr 1) and 1;
  mov ebx,ecx
  shr ebx,1
  and bl,$01
  mov [edx+$01],bl
  // DecPixels[2] := (EncPixels shr 2) and 1;
  mov ebx,ecx
  shr ebx,$02
  and bl,$01
  mov [edx+$02],bl
  // DecPixels[3] := (EncPixels shr 3) and 1;
  mov ebx,ecx
  shr ebx,$03
  and bl,$01
  mov [edx+$03],bl
  // DecPixels[4] := (EncPixels shr 4) and 1;
  mov ebx,ecx
  shr ebx,$04
  and bl,$01
  mov [edx+$04],bl
  // DecPixels[5] := (EncPixels shr 5) and 1;
  mov ebx,ecx
  shr ebx,$05
  and bl,$01
  mov [edx+$05],bl
  // DecPixels[6] := (EncPixels shr 6) and 1;
  mov ebx,ecx
  shr ebx,$06
  and bl,$01
  mov [edx+$06],bl
  // DecPixels[7] := (EncPixels shr 7) and 1;
  shr ecx,$07
  and cl,$01
  mov [edx+$07],cl
  pop ebx;

编辑:正如建议的那样，表查找确实更快。

var
  PixelLookup:Array[byte] of TDecodedPixels;

// You could precalculate, but the performance gain would hardly be worth it because you call this once only.
for I := 0 to 255 do
  DecodePixels5b(I, PixelLookup[I]);


procedure DecodePixels7(EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  DecPixels := PixelLookup[EncPixels];
end;

Results:

Time1  : 1,03096806151283 ms.    <- Delphi loop.
Time2  : 0,740308641141395 ms.   <- Delphi unrolled loop.
Time3  : 0,996602425688886 ms.   <- BASM loop.
Time4a : 0,608267951561275 ms.   <- BASM unrolled loop.
Time4b : 0,574162510648039 ms.   <- BASM unrolled loop instruction switch.
Time5  : 0,499628206138524 ms. !!!  <- Delphi unrolled loop 5.
Time7 : 0,251533475182096 ms.    <- simple table lookup

票数 16

Stack Overflow用户

发布于 2009-09-12 13:02:24

在Nick D的回答上，我尝试了下面的基于表查找的版本，所有的都比您给的实现要快(而且比Wouter的代码还要快)。

给定以下打包数组：

      const Uint64DecPix : PACKED ARRAY [0..255] OF UINT64 =
  ( $0000000000000000, $0000000000000001, $0000000000000100, $0000000000000101, $0000000000010000, $0000000000010001, $0000000000010100, $0000000000010101, $0000000001000000, $0000000001000001, $0000000001000100, $0000000001000101, $0000000001010000, $0000000001010001, $0000000001010100, $0000000001010101,
    $0000000100000000, $0000000100000001, $0000000100000100, $0000000100000101, $0000000100010000, $0000000100010001, $0000000100010100, $0000000100010101, $0000000101000000, $0000000101000001, $0000000101000100, $0000000101000101, $0000000101010000, $0000000101010001, $0000000101010100, $0000000101010101,
    $0000010000000000, $0000010000000001, $0000010000000100, $0000010000000101, $0000010000010000, $0000010000010001, $0000010000010100, $0000010000010101, $0000010001000000, $0000010001000001, $0000010001000100, $0000010001000101, $0000010001010000, $0000010001010001, $0000010001010100, $0000010001010101,
    $0000010100000000, $0000010100000001, $0000010100000100, $0000010100000101, $0000010100010000, $0000010100010001, $0000010100010100, $0000010100010101, $0000010101000000, $0000010101000001, $0000010101000100, $0000010101000101, $0000010101010000, $0000010101010001, $0000010101010100, $0000010101010101,
    $0001000000000000, $0001000000000001, $0001000000000100, $0001000000000101, $0001000000010000, $0001000000010001, $0001000000010100, $0001000000010101, $0001000001000000, $0001000001000001, $0001000001000100, $0001000001000101, $0001000001010000, $0001000001010001, $0001000001010100, $0001000001010101,
    $0001000100000000, $0001000100000001, $0001000100000100, $0001000100000101, $0001000100010000, $0001000100010001, $0001000100010100, $0001000100010101, $0001000101000000, $0001000101000001, $0001000101000100, $0001000101000101, $0001000101010000, $0001000101010001, $0001000101010100, $0001000101010101,
    $0001010000000000, $0001010000000001, $0001010000000100, $0001010000000101, $0001010000010000, $0001010000010001, $0001010000010100, $0001010000010101, $0001010001000000, $0001010001000001, $0001010001000100, $0001010001000101, $0001010001010000, $0001010001010001, $0001010001010100, $0001010001010101,
    $0001010100000000, $0001010100000001, $0001010100000100, $0001010100000101, $0001010100010000, $0001010100010001, $0001010100010100, $0001010100010101, $0001010101000000, $0001010101000001, $0001010101000100, $0001010101000101, $0001010101010000, $0001010101010001, $0001010101010100, $0001010101010101,
    $0100000000000000, $0100000000000001, $0100000000000100, $0100000000000101, $0100000000010000, $0100000000010001, $0100000000010100, $0100000000010101, $0100000001000000, $0100000001000001, $0100000001000100, $0100000001000101, $0100000001010000, $0100000001010001, $0100000001010100, $0100000001010101,
    $0100000100000000, $0100000100000001, $0100000100000100, $0100000100000101, $0100000100010000, $0100000100010001, $0100000100010100, $0100000100010101, $0100000101000000, $0100000101000001, $0100000101000100, $0100000101000101, $0100000101010000, $0100000101010001, $0100000101010100, $0100000101010101,
    $0100010000000000, $0100010000000001, $0100010000000100, $0100010000000101, $0100010000010000, $0100010000010001, $0100010000010100, $0100010000010101, $0100010001000000, $0100010001000001, $0100010001000100, $0100010001000101, $0100010001010000, $0100010001010001, $0100010001010100, $0100010001010101,
    $0100010100000000, $0100010100000001, $0100010100000100, $0100010100000101, $0100010100010000, $0100010100010001, $0100010100010100, $0100010100010101, $0100010101000000, $0100010101000001, $0100010101000100, $0100010101000101, $0100010101010000, $0100010101010001, $0100010101010100, $0100010101010101,
    $0101000000000000, $0101000000000001, $0101000000000100, $0101000000000101, $0101000000010000, $0101000000010001, $0101000000010100, $0101000000010101, $0101000001000000, $0101000001000001, $0101000001000100, $0101000001000101, $0101000001010000, $0101000001010001, $0101000001010100, $0101000001010101,
    $0101000100000000, $0101000100000001, $0101000100000100, $0101000100000101, $0101000100010000, $0101000100010001, $0101000100010100, $0101000100010101, $0101000101000000, $0101000101000001, $0101000101000100, $0101000101000101, $0101000101010000, $0101000101010001, $0101000101010100, $0101000101010101,
    $0101010000000000, $0101010000000001, $0101010000000100, $0101010000000101, $0101010000010000, $0101010000010001, $0101010000010100, $0101010000010101, $0101010001000000, $0101010001000001, $0101010001000100, $0101010001000101, $0101010001010000, $0101010001010001, $0101010001010100, $0101010001010101,
    $0101010100000000, $0101010100000001, $0101010100000100, $0101010100000101, $0101010100010000, $0101010100010001, $0101010100010100, $0101010100010101, $0101010101000000, $0101010101000001, $0101010101000100, $0101010101000101, $0101010101010000, $0101010101010001, $0101010101010100, $0101010101010101);
PUint64DecPix : pointer = @Uint64DecPix;

您可以编写以下内容：

procedure DecodePixelsPS1Pas (EncPixels: Byte; var DecPixels: TDecodedPixels);
begin
  DecPixels := TDecodedPixels(Uint64DecPix[EncPixels]);
end;

procedure DecodePixelsPS1PasInline (EncPixels: Byte; var DecPixels: TDecodedPixels);
inline;
begin
  DecPixels := TDecodedPixels(Uint64DecPix[EncPixels]);
end;

procedure DecodePixelsPS1Asm (EncPixels: Byte; var DecPixels: TDecodedPixels);
asm
  lea ecx, Uint64DecPix //[<-Added in EDIT 3] 
  //mov ecx, dword ptr PUint64DecPix - alternative to the above line (slower for me)
  movzx eax, al
  movq xmm0, [8*eax+ecx]  //Using XMM rather than MMX so we don't have to issue emms at the end
  movq [edx], xmm0        //use MOVQ because it doesn't need mem alignment
end;


The standard PAS and ASM implementations are fairly similar speed-wise, but the PAS implementation marked with "INLINE" is the fastest because it gets rid of all the call/ret involved in calling the routine.

--EDIT--: I forgot to say: since you are implicitly assuming something about the memory layout of your TDecodedPixels structure, it would be better if you declare it as 

PACKED ARRAY [0..7] of byte


--EDIT2--:
Here are my results for comparison:

Time1 : 2.51638266874701 ms.    <- Delphi loop.
Time2 : 2.11277620479698 ms.    <- Delphi unrolled loop.
Time3 : 2.21972066282167 ms.    <- BASM loop.
Time4a : 1.34093090043567 ms.    <- BASM unrolled loop.
Time4b : 1.52222070123437 ms.    <- BASM unrolled loop instruction switch.
Time5 : 1.17106364076999 ms.    <- Wouter van Nifterick
TimePS1 : 0.633099318488802 ms.    <- PS.Pas
TimePS2 : 0.551617593856202 ms.    <- PS.Pas Inline
TimePS3 : 0.70921094720139 ms.    <- PS.Asm (speed for version before 3rd EDIT)

票数 5

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/1414911

复制

相似问题

问将8位扩展为0或1的8布尔字节的英特尔x86组装优化技术
EN

回答 13

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问将8位扩展为0或1的8布尔字节的英特尔x86组装优化技术EN

回答 13

Stack Overflow用户

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问将8位扩展为0或1的8布尔字节的英特尔x86组装优化技术
EN