Step to UEFI (176)memset的实现方法

之前的文章“哪里来的的 memset”【参考1】提到过因为编译器擅作主张使用memset优化引起了很诡异的问题。可以通过关闭编译优化来避免错误,这里从代码的角度分析 EDK2 是如何实现 memset 功能的。

  1. \MdePkg\Library\BaseMemoryLib\MemLibGeneric.c 提供了三个函数

InternalMemSetMem16

 InternalMemSetMem32

InternalMemSetMem64

以 InternalMemSetMem16  为例:

/**
  Stores a 16-bit pattern into every element of an array of UINT16.

  Elements are written from the highest index (Length - 1) down to index 0.

  @param  Buffer  Start of the destination array.
  @param  Length  Number of 16-bit elements (not bytes) to write.
  @param  Value   The 16-bit value stored into each element.

  @return Buffer, exactly as it was passed in.

**/
VOID *
EFIAPI
InternalMemSetMem16 (
  OUT     VOID                      *Buffer,
  IN      UINTN                     Length,
  IN      UINT16                    Value
  )
{
  UINT16                            *Words;

  //
  // View the untyped destination as an array of 16-bit elements once,
  // instead of casting on every iteration.
  //
  Words = (UINT16*)Buffer;

  for (; Length != 0; Length--) {
    Words[Length - 1] = Value;
  }

  return Buffer;
}

看起来 for (; Length != 0; Length--) 这种从后向前逐个写入的循环写法足够“迷惑”编译器,使它不会把循环替换成 memset 调用;不过这依赖于具体编译器的行为,并没有标准上的保证。

2. \MdePkg\Library\BaseMemoryLib\SetMem.c 提供了InternalMemSetMem()

/**
  Set Length bytes of Buffer to Value.

  When Buffer is 8-byte aligned and Length >= 8, the bulk of the region is
  written 64 bits at a time; otherwise, when Buffer is 4-byte aligned and
  Length >= 4, it is written 32 bits at a time. Any remaining bytes (or the
  whole region, if neither condition holds) are written one byte at a time.

  @param  Buffer   The memory to set.
  @param  Length   The number of bytes to set.
  @param  Value    The byte value written to every position.

  @return Buffer

**/
VOID *
EFIAPI
InternalMemSetMem (
  OUT     VOID                      *Buffer,
  IN      UINTN                     Length,
  IN      UINT8                     Value
  )
{
  //
  // Declare the local variables that actually move the data elements as
  // volatile to prevent the optimizer from replacing this function with
  // the intrinsic memset()
  //
  volatile UINT8                    *Pointer8;
  volatile UINT32                   *Pointer32;
  volatile UINT64                   *Pointer64;
  UINT32                            Value32;
  UINT64                            Value64;

  if ((((UINTN)Buffer & 0x7) == 0) && (Length >= 8)) {
    //
    // Generate the 64bit value.
    // Value is widened to UINT32 before shifting: a UINT8 operand would be
    // promoted to signed int, and shifting a value >= 0x80 left by 24 bits
    // is not representable in a 32-bit int (undefined behavior in C).
    //
    Value32 = ((UINT32)Value << 24) |
              ((UINT32)Value << 16) |
              ((UINT32)Value << 8)  |
              (UINT32)Value;
    Value64 = LShiftU64 (Value32, 32) | Value32;

    Pointer64 = (UINT64*)Buffer;
    while (Length >= 8) {
      *(Pointer64++) = Value64;
      Length -= 8;
    }

    // Finish with bytes if needed
    Pointer8 = (UINT8*)Pointer64;
  } else if ((((UINTN)Buffer & 0x3) == 0) && (Length >= 4)) {
    //
    // Generate the 32bit value (widened before shifting, see above).
    //
    Value32 = ((UINT32)Value << 24) |
              ((UINT32)Value << 16) |
              ((UINT32)Value << 8)  |
              (UINT32)Value;

    Pointer32 = (UINT32*)Buffer;
    while (Length >= 4) {
      *(Pointer32++) = Value32;
      Length -= 4;
    }

    // Finish with bytes if needed
    Pointer8 = (UINT8*)Pointer32;
  } else {
    // Unaligned or very short region: byte-by-byte only
    Pointer8 = (UINT8*)Buffer;
  }
  //
  // Write whatever is left (0..7 bytes after a wide pass, or everything)
  //
  while (Length-- > 0) {
    *(Pointer8++) = Value;
  }
  return Buffer;
}

这里避免被编译器优化的方法是把实际写内存的指针声明为 volatile(见代码中的注释)。此外还可以看出,当 Buffer 按 8 字节(或 4 字节)对齐且长度足够时,这个函数特地以 8 字节(或 4 字节)为单位填充来提升效率,剩余的部分再按字节填充。

3. \MdePkg\Library\UefiMemoryLib\MemLib.c 中的InternalMemSetMem 函数直接调用 gBS 提供的服务

/**
  Fills a target buffer with a byte value, and returns the target buffer.

  This function is a thin wrapper: it forwards its three arguments, in the
  same order, to gBS->SetMem() and then returns Buffer. Nothing returned by
  gBS->SetMem() is used here.

  NOTE(review): presumably this may only be called while the gBS table is
  valid (i.e. boot services are still available) -- confirm against callers.

  @param  Buffer    Memory to set.
  @param  Size      The number of bytes to set.
  @param  Value     Value of the set operation.

  @return Buffer.

**/
VOID *
EFIAPI
InternalMemSetMem (
  OUT     VOID                      *Buffer,
  IN      UINTN                     Size,
  IN      UINT8                     Value
  )
{
  // Delegate the actual fill to the service reached through gBS.
  gBS->SetMem (Buffer, Size, Value);
  // Hand the caller's pointer back so the call can be chained.
  return Buffer;
}

4. 通过 volatile 声明变量避免编译器的优化,简单粗暴,和前面 2 提到的没有本质差别。volatile 是一个类型限定符(type qualifier),它的作用是告诉编译器:对该变量的每一次访问都不能因优化而被省略,并且每次都要直接从内存中读取其值。把变量声明为 volatile 表示这个变量可能会被意想不到地改变,这样,编译器就不会去假设这个变量的值了。【参考2】

 \EdkCompatibilityPkg\Foundation\Library\EdkIIGlueLib\Library\BaseMemoryLib\Ebc\SetMem.c

/**
  Writes Value into each of the first Size bytes of Buffer.

  Bytes are written one at a time, from the lowest address upwards.

  @param  Buffer  Start of the memory region to fill.
  @param  Size    Number of bytes to write.
  @param  Value   Byte value stored at every position.

  @return Buffer

**/
VOID *
EFIAPI
InternalMemSetMem (
  IN      VOID                      *Buffer,
  IN      UINTN                     Size,
  IN      UINT8                     Value
  )
{
  //
  // The write cursor is a pointer-to-volatile so that the optimizer does
  // not replace this loop with the intrinsic memset()
  //
  volatile UINT8                    *Cursor;
  UINTN                             Index;

  Cursor = (UINT8*)Buffer;
  for (Index = 0; Index < Size; Index++) {
    Cursor[Index] = Value;
  }

  return Buffer;
}

5.汇编语言实现

\EdkCompatibilityPkg\Foundation\Library\CompilerStub\X64\memset.asm

\EdkCompatibilityPkg\Foundation\Library\CompilerStub\Ia32\memset.asm

IA32汇编的实现

    .686
    .model  flat,C
    .mmx
    .code

;------------------------------------------------------------------------------
;  VOID *
;  memset (
;    OUT VOID   *Buffer,
;    IN  UINT8  Value,
;    IN  UINTN  Count
;    )
;
;  Fills Count bytes at Buffer with Value and returns Buffer in eax.
;  Count / 8 quadwords are stored through mm0 first; the remaining
;  Count % 8 bytes are stored with rep stosb.
;
;  Arguments are read at [esp + 8] (Buffer), [esp + 12] (Value) and
;  [esp + 16] (Count): "USES edi" saves edi on entry, so every argument sits
;  one dword further from esp than it would right after the call.
;
;  NOTE(review): no EMMS is executed after the MMX instructions; mm0/mm1 are
;  restored by value only -- confirm that callers do not depend on x87 state.
;  NOTE(review): rep stosb assumes the direction flag is clear on entry --
;  confirm against the calling convention in use.
;------------------------------------------------------------------------------
memset   PROC    USES    edi
    mov     al, [esp + 12]              ; al <- Value
    mov     ah, al                      ; ax <- Value repeated twice
    shrd    edx, eax, 16                ; upper 16 bits of edx <- ax
    shld    eax, edx, 16                ; eax <- Value repeated four times
    mov     ecx, [esp + 16]             ; ecx <- Count
    cmp     ecx, 0                      ; if Count == 0, do nothing
    je      @SetDone
    mov     edi, [esp + 8]              ; edi <- Buffer
    mov     edx, ecx
    and     edx, 7                      ; edx <- Count % 8 (tail bytes)
    shr     ecx, 3                      ; # of Qwords to set
    jz      @SetBytes                   ; fewer than 8 bytes: skip the MMX loop
    add     esp, -10h                   ; reserve 16 bytes of stack
    movq    [esp], mm0                  ; save mm0
    movq    [esp + 8], mm1              ; save mm1
    movd    mm0, eax                    ; low dword of mm0 <- 4 x Value
    movd    mm1, eax                    ; low dword of mm1 <- 4 x Value
    psllq   mm0, 32                     ; move mm0's pattern to its high dword
    por     mm0, mm1                    ; fill mm0 with 8 Value's
@@:
    movq    [edi], mm0                  ; store 8 bytes at the write cursor
    add     edi, 8                      ; advance the write cursor
    loop    @B                          ; ecx--, repeat while ecx != 0
    movq    mm0, [esp]                  ; restore mm0
    movq    mm1, [esp + 8]              ; restore mm1
    add     esp, 10h                    ; stack cleanup
@SetBytes:
    mov     ecx, edx                    ; ecx <- number of tail bytes
    rep     stosb                       ; store al at [edi], ecx times
@SetDone:    
    mov     eax, [esp + 8]              ; eax <- Buffer as return value
    ret
memset   ENDP

    END

上面就是实现 SetMem 函数的基本方法,如果在 Porting 代码到 UEFI时遇到 MemSet 的错误,不妨试试直接将上面的代码搬迁到程序中。

参考:

  1. https://www.lab-z.com/stu136/  Step to UEFI (136)哪里来的的 memset 
  2. https://baike.baidu.com/item/volatile/10606957?fr=aladdin volatile

《Step to UEFI (176)memset的实现方法》有7个想法

  1. 用 volatile 修饰才是正确的方法,尤其是涉及嵌入式开发的有副作用的写入。参见 https://en.cppreference.com/w/c/language/volatile
    另外参见 https://stackoverflow.com/a/15618139/1190191

    第一个代码,我重写了一下:

    #include <stdint.h>

    /*
     * Store the 16-bit value v into each of the len elements of the
     * uint16_t array starting at buffer, and return buffer.
     *
     * Elements are written from index len - 1 down to index 0, so exactly
     * the range [0, len) is touched; len == 0 writes nothing.
     */
    void *internal_memset(void *buffer, unsigned int len, uint16_t v)
    {
        for (; len != 0; --len) {
            /* len - 1, not len: indexing with len would skip element 0 and
               write one element past the end of the array. */
            ((uint16_t *)buffer)[len - 1] = v;
        }
        return buffer;
    }

    在 Linux 上用 clang 开 -O2 -march=native 编译,反汇编发现它会用 SIMD 指令集 vectorization:


    0000000000000000 <internal_memset>:
    #include <stdint.h>

    void *internal_memset(void *buffer, unsigned int len, uint16_t v)
    {
    for (; len!=0; --len)
    0: 41 57 push r15
    2: 41 56 push r14
    4: 53 push rbx
    5: 85 f6 test esi,esi
    7: 0f 84 0f 01 00 00 je 11c
    d: 89 f0 mov eax,esi
    f: 44 8d 7e ff lea r15d,[rsi-0x1]
    13: 4d 8d 47 01 lea r8,[r15+0x1]
    17: 49 83 f8 10 cmp r8,0x10
    1b: 0f 82 ef 00 00 00 jb 110
    21: 49 bb f0 ff ff ff 01 movabs r11,0x1fffffff0
    28: 00 00 00
    2b: 4d 89 c1 mov r9,r8
    2e: 4d 21 d9 and r9,r11
    31: 4d 89 c2 mov r10,r8
    34: 4d 21 da and r10,r11
    37: 0f 84 d3 00 00 00 je 110
    3d: c5 f9 6e c2 vmovd xmm0,edx
    41: 4d 8d 77 01 lea r14,[r15+0x1]
    45: 4d 21 de and r14,r11
    48: 49 83 c6 f0 add r14,0xfffffffffffffff0
    4c: 44 89 f1 mov ecx,r14d
    4f: c1 e9 04 shr ecx,0x4
    52: 83 c1 01 add ecx,0x1
    55: 31 db xor ebx,ebx
    57: f6 c1 07 test cl,0x7
    5a: 74 36 je 92
    ((uint16_t *)buffer)[len] = v;
    5c: c4 e2 7d 79 c8 vpbroadcastw ymm1,xmm0
    61: 48 8d 4c 47 e2 lea rcx,[rdi+rax*2-0x1e]
    66: 83 e6 70 and esi,0x70
    69: 83 c6 f0 add esi,0xfffffff0
    6c: c1 ee 04 shr esi,0x4
    6f: 83 c6 01 add esi,0x1
    72: 83 e6 07 and esi,0x7
    75: 48 f7 de neg rsi
    78: 31 db xor ebx,ebx
    7a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
    80: c5 fe 7f 09 vmovdqu YMMWORD PTR [rcx],ymm1
    for (; len!=0; --len)
    84: 48 83 c3 10 add rbx,0x10
    88: 48 83 c1 e0 add rcx,0xffffffffffffffe0
    8c: 48 83 c6 01 add rsi,0x1
    90: 75 ee jne 80
    92: 49 83 fe 70 cmp r14,0x70
    96: 72 65 jb fd
    ((uint16_t *)buffer)[len] = v;
    98: c4 e2 7d 79 c0 vpbroadcastw ymm0,xmm0
    for (; len!=0; --len)
    9d: 49 83 c7 01 add r15,0x1
    a1: 4d 21 df and r15,r11
    a4: 49 29 df sub r15,rbx
    a7: 48 8d 4c 00 e2 lea rcx,[rax+rax*1-0x1e]
    ac: 48 01 db add rbx,rbx
    af: 48 29 d9 sub rcx,rbx
    b2: 48 01 f9 add rcx,rdi
    b5: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
    bc: 00 00 00 00
    ((uint16_t *)buffer)[len] = v;
    c0: c5 fe 7f 01 vmovdqu YMMWORD PTR [rcx],ymm0
    c4: c5 fe 7f 41 e0 vmovdqu YMMWORD PTR [rcx-0x20],ymm0
    c9: c5 fe 7f 41 c0 vmovdqu YMMWORD PTR [rcx-0x40],ymm0
    ce: c5 fe 7f 41 a0 vmovdqu YMMWORD PTR [rcx-0x60],ymm0
    d3: c5 fe 7f 41 80 vmovdqu YMMWORD PTR [rcx-0x80],ymm0
    d8: c5 fe 7f 81 60 ff ff vmovdqu YMMWORD PTR [rcx-0xa0],ymm0
    df: ff
    e0: c5 fe 7f 81 40 ff ff vmovdqu YMMWORD PTR [rcx-0xc0],ymm0
    e7: ff
    e8: c5 fe 7f 81 20 ff ff vmovdqu YMMWORD PTR [rcx-0xe0],ymm0
    ef: ff
    for (; len!=0; --len)
    f0: 48 81 c1 00 ff ff ff add rcx,0xffffffffffffff00
    f7: 49 83 c7 80 add r15,0xffffffffffffff80
    fb: 75 c3 jne c0
    fd: 4d 39 d0 cmp r8,r10
    100: 74 1a je 11c
    102: 4c 29 c8 sub rax,r9
    105: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
    10c: 00 00 00 00
    ((uint16_t *)buffer)[len] = v;
    110: 66 89 14 47 mov WORD PTR [rdi+rax*2],dx
    for (; len!=0; --len)
    114: 48 83 c0 ff add rax,0xffffffffffffffff
    118: 85 c0 test eax,eax
    11a: 75 f4 jne 110
    return buffer;
    11c: 48 89 f8 mov rax,rdi
    11f: 5b pop rbx
    120: 41 5e pop r14
    122: 41 5f pop r15
    124: c5 f8 77 vzeroupper
    127: c3 ret

    1. 代码嵌入混乱了。我不清楚这里怎么做是正确的。我习惯 StackOverflow 上用 markdown 的修饰符去嵌入代码。

      暂时用 pastebin 吧。

      C代码:https://pastebin.com/WiHWVzFZ
      反汇编结果:https://pastebin.com/SrWC660z

    2. 另外建议尽量参考 C 标准(比如 C89, C99, C11),不要依赖于编译器的行为,因为编译器一换或者升级就会发生变化,但符合标准规定的合格编译器是不会和标准冲突,尤其是开优化的时候,很可能 undefined behavior 会导致出乎意料的结果。

  2. 仔细想了下,你可能并不是想提示编译器不优化,而是提示编译器编译时不把手工代码优化成库函数调用,或者更一般的,不引入库函数调用。类似 gcc 的 -fno-tree-loop-distribute-patterns:https://stackoverflow.com/a/33818680/1190191

    msvc 好像没有类似的选项。此外 msvc 是 C++ 编译器,只是可以编译 C,但对标准的支持不理想:https://stackoverflow.com/a/48615212/1190191

    不调用库函数,gcc 也能优化循环。看上去那个手动用 8 bytes 的优化是不需要的(甚至我不知道这样做是不是可能干扰编译器优化)。

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注