Step to UEFI (176）memset的实现方法

之前的文章“哪里来的的 memset”【参考1】提到过因为编译器擅作主张使用memset优化引起了很诡异的问题。可以通过关闭编译优化来避免错误，这里从代码的角度分析 EDK2 是如何实现 memset 功能的。

1. \MdePkg\Library\BaseMemoryLib\MemLibGeneric.c 提供了三个函数

InternalMemSetMem16

InternalMemSetMem32

InternalMemSetMem64

以 InternalMemSetMem16 为例：

/**
  Stores a 16-bit value into each of Length consecutive 16-bit elements
  starting at Buffer, and returns Buffer.

  Elements are written from the highest index down to index 0. When Length
  is 0 nothing is written.

  @param  Buffer  The pointer to the target buffer to fill.
  @param  Length  The number of 16-bit elements (not bytes) to write.
  @param  Value   The 16-bit value stored into every element.

  @return Buffer

**/
VOID *
EFIAPI
InternalMemSetMem16 (
  OUT     VOID                      *Buffer,
  IN      UINTN                     Length,
  IN      UINT16                    Value
  )
{
  UINT16                            *Words;
  UINTN                             Remaining;

  Words     = (UINT16*)Buffer;
  Remaining = Length;

  //
  // Walk backwards: decrement first, then store, so the last store lands
  // on element 0.
  //
  while (Remaining > 0) {
    Remaining--;
    Words[Remaining] = Value;
  }

  return Buffer;
}

看起来 for (; Length != 0; Length--) 这样的定义足够“迷惑”编译器避免优化。

2. \MdePkg\Library\BaseMemoryLib\SetMem.c 提供了InternalMemSetMem()

/**
  Set Buffer to Value for Length bytes.

  When Buffer is 8-byte aligned and at least 8 bytes are requested, the bulk
  of the buffer is written with 64-bit stores; otherwise, when Buffer is
  4-byte aligned and at least 4 bytes are requested, 32-bit stores are used.
  Any remaining bytes (or the whole buffer if neither case applies) are
  written one byte at a time.

  @param  Buffer   The memory to set.
  @param  Length   The number of bytes to set.
  @param  Value    The value of the set operation.

  @return Buffer

**/
VOID *
EFIAPI
InternalMemSetMem (
  OUT     VOID                      *Buffer,
  IN      UINTN                     Length,
  IN      UINT8                     Value
  )
{
  //
  // Declare the local variables that actually move the data elements as
  // volatile to prevent the optimizer from replacing this function with
  // the intrinsic memset()
  //
  volatile UINT8                    *Pointer8;
  volatile UINT32                   *Pointer32;
  volatile UINT64                   *Pointer64;
  UINT32                            Value32;
  UINT64                            Value64;

  //
  // Replicate Value into all four bytes of a 32-bit pattern. Value is widened
  // to UINT32 before shifting: a UINT8 operand is promoted to signed int, and
  // shifting a value >= 0x80 left by 24 would move a 1 into the sign bit,
  // which is undefined behavior in C.
  //
  Value32 = (UINT32)Value;
  Value32 = (Value32 << 24) | (Value32 << 16) | (Value32 << 8) | Value32;

  if ((((UINTN)Buffer & 0x7) == 0) && (Length >= 8)) {
    // Generate the 64bit value
    Value64 = LShiftU64 (Value32, 32) | Value32;

    Pointer64 = (UINT64*)Buffer;
    while (Length >= 8) {
      *(Pointer64++) = Value64;
      Length -= 8;
    }

    // Finish with bytes if needed
    Pointer8 = (UINT8*)Pointer64;
  } else if ((((UINTN)Buffer & 0x3) == 0) && (Length >= 4)) {
    Pointer32 = (UINT32*)Buffer;
    while (Length >= 4) {
      *(Pointer32++) = Value32;
      Length -= 4;
    }

    // Finish with bytes if needed
    Pointer8 = (UINT8*)Pointer32;
  } else {
    Pointer8 = (UINT8*)Buffer;
  }

  //
  // Byte-wise tail: at most 7 bytes after the 64-bit path, at most 3 after
  // the 32-bit path, or the entire buffer on the unaligned/short path.
  //
  while (Length-- > 0) {
    *(Pointer8++) = Value;
  }
  return Buffer;
}

这里避免被编译器优化的方法是把实际写内存的指针声明为 volatile（见代码中的注释），此外还可以看出这个函数特地用 8 bytes 填充提升效率。

3. \MdePkg\Library\UefiMemoryLib\MemLib.c 中的InternalMemSetMem 函数直接调用 gBS 提供的服务

/**
  Fills a target buffer with a byte value, and returns the target buffer.

  This function wraps the gBS->SetMem(): it forwards Buffer, Size and Value
  unchanged and performs no checks of its own.

  @param  Buffer    Memory to set.
  @param  Size      The number of bytes to set.
  @param  Value     Value of the set operation.

  @return Buffer.

**/
VOID *
EFIAPI
InternalMemSetMem (
  OUT     VOID                      *Buffer,
  IN      UINTN                     Size,
  IN      UINT8                     Value
  )
{
  //
  // gBS is not defined in this block; presumably the UEFI Boot Services
  // table pointer provided elsewhere -- confirm against the library's
  // dependencies.
  //
  gBS->SetMem (Buffer, Size, Value);
  return Buffer;
}

4. 通过 volatile 声明变量避免编译器的优化，简单粗暴，和前面 2 提到的没有本质差别。volatile 是一个类型限定符（type qualifier）。volatile 的作用是作为指令关键字，确保本条指令不会因编译器的优化而省略，且要求每次直接读值。volatile 的变量是说这变量可能会被意想不到地改变，这样，编译器就不会去假设这个变量的值了。【参考2】

\EdkCompatibilityPkg\Foundation\Library\EdkIIGlueLib\Library\BaseMemoryLib\Ebc\SetMem.c

/**
  Writes Value into each of the Size bytes starting at Buffer, in ascending
  address order, and returns Buffer.

  @param  Buffer Memory to set.
  @param  Size   Number of bytes to set.
  @param  Value  Byte value stored into every position.

  @return Buffer

**/
VOID *
EFIAPI
InternalMemSetMem (
  IN      VOID                      *Buffer,
  IN      UINTN                     Size,
  IN      UINT8                     Value
  )
{
  //
  // The destination is accessed through a volatile pointer so that the
  // optimizer does not replace this loop with the intrinsic memset()
  //
  volatile UINT8                    *Destination;
  UINTN                             Index;

  Destination = (UINT8*)Buffer;
  for (Index = 0; Index < Size; Index++) {
    Destination[Index] = Value;
  }

  return Buffer;
}

5.汇编语言实现

\EdkCompatibilityPkg\Foundation\Library\CompilerStub\X64\memset.asm

\EdkCompatibilityPkg\Foundation\Library\CompilerStub\Ia32\memset.asm

IA32汇编的实现

    .686
    .model  flat,C
    .mmx
    .code

;------------------------------------------------------------------------------
;  VOID *
;  memset (
;    OUT VOID   *Buffer,
;    IN  UINT8  Value,
;    IN  UINTN  Count
;    )
;
;  Fills Count bytes at Buffer with Value and returns Buffer in eax.
;  Whole 8-byte groups are stored with MMX movq; the remaining
;  (Count mod 8) bytes are stored with rep stosb.
;
;  Stack offsets below are 4 bytes larger than the raw cdecl offsets because
;  "USES edi" makes the assembler push edi in the prologue (and pop it before
;  ret): [esp + 8] = Buffer, [esp + 12] = Value, [esp + 16] = Count.
;------------------------------------------------------------------------------
memset   PROC    USES    edi
    mov     al, [esp + 12]              ; al <- Value
    mov     ah, al                      ; ax <- Value:Value
    shrd    edx, eax, 16                ; upper 16 bits of edx <- ax
    shld    eax, edx, 16                ; eax <- four copies of Value
    mov     ecx, [esp + 16]             ; ecx <- Count
    cmp     ecx, 0                      ; if Count == 0, do nothing
    je      @SetDone
    mov     edi, [esp + 8]              ; edi <- Buffer
    mov     edx, ecx
    and     edx, 7                      ; edx <- Count mod 8 (trailing bytes)
    shr     ecx, 3                      ; # of Qwords to set
    jz      @SetBytes                   ; fewer than 8 bytes: skip the MMX path
    add     esp, -10h                   ; reserve 16 bytes of stack for mm0/mm1
    movq    [esp], mm0                  ; save mm0
    movq    [esp + 8], mm1              ; save mm1
    movd    mm0, eax                    ; low dword of mm0 <- 4 Value's
    movd    mm1, eax                    ; low dword of mm1 <- 4 Value's
    psllq   mm0, 32                     ; move mm0's 4 Value's to the high dword
    por     mm0, mm1                    ; fill mm0 with 8 Value's
@@:
    movq    [edi], mm0                  ; store 8 bytes at the current position
    add     edi, 8
    loop    @B                          ; repeat ecx (Qword count) times
    movq    mm0, [esp]                  ; restore mm0
    movq    mm1, [esp + 8]              ; restore mm1
    add     esp, 10h                    ; stack cleanup
@SetBytes:
    mov     ecx, edx                    ; ecx <- number of trailing bytes
    rep     stosb                       ; store al at [edi] ecx times
                                        ; NOTE(review): assumes the direction
                                        ; flag is clear on entry -- confirm
@SetDone:    
    mov     eax, [esp + 8]              ; eax <- Buffer as return value
    ret
memset   ENDP

    END

上面就是实现 SetMem 函数的基本方法，如果在 Porting 代码到 UEFI时遇到 MemSet 的错误，不妨试试直接将上面的代码搬迁到程序中。

参考：

1. http://www.lab-z.com/stu136/ Step to UEFI (136）哪里来的的 memset
2. https://baike.baidu.com/item/volatile/10606957?fr=aladdin volatile

《Step to UEFI (176）memset的实现方法》有7个想法

用 volatile 修饰才是正确的方法，尤其是涉及嵌入式开发的有副作用的写入。参见 https://en.cppreference.com/w/c/language/volatile
另外参见 https://stackoverflow.com/a/15618139/1190191

第一个代码，我重写了一下：
#include <stdint.h>

/*
 * Fill `len` consecutive 16-bit elements starting at `buffer` with `v`,
 * writing from the last element down to the first.
 *
 * buffer: destination; must have room for `len` uint16_t elements.
 * len:    number of 16-bit elements (not bytes) to write; 0 writes nothing.
 * v:      value stored into every element.
 *
 * Returns `buffer`.
 */
void *internal_memset(void *buffer, unsigned int len, uint16_t v)
{
    uint16_t *dst = buffer;

    /*
     * Index with len - 1, not len: valid indices are 0 .. len-1. Using
     * dst[len] would write one element past the end and never set dst[0].
     */
    for (; len != 0; --len) {
        dst[len - 1] = v;
    }
    return buffer;
}

在 Linux 上用 clang 开 -O2 -march=native 编译，反汇编发现它会用 SIMD 指令集 vectorization：

0000000000000000 : #include

void *internal_memset(void *buffer, unsigned int len, uint16_t v) { for (; len!=0; --len) 0: 41 57 push r15 2: 41 56 push r14 4: 53 push rbx 5: 85 f6 test esi,esi 7: 0f 84 0f 01 00 00 je 11c d: 89 f0 mov eax,esi f: 44 8d 7e ff lea r15d,[rsi-0x1] 13: 4d 8d 47 01 lea r8,[r15+0x1] 17: 49 83 f8 10 cmp r8,0x10 1b: 0f 82 ef 00 00 00 jb 110 21: 49 bb f0 ff ff ff 01 movabs r11,0x1fffffff0 28: 00 00 00 2b: 4d 89 c1 mov r9,r8 2e: 4d 21 d9 and r9,r11 31: 4d 89 c2 mov r10,r8 34: 4d 21 da and r10,r11 37: 0f 84 d3 00 00 00 je 110 3d: c5 f9 6e c2 vmovd xmm0,edx 41: 4d 8d 77 01 lea r14,[r15+0x1] 45: 4d 21 de and r14,r11 48: 49 83 c6 f0 add r14,0xfffffffffffffff0 4c: 44 89 f1 mov ecx,r14d 4f: c1 e9 04 shr ecx,0x4 52: 83 c1 01 add ecx,0x1 55: 31 db xor ebx,ebx 57: f6 c1 07 test cl,0x7 5a: 74 36 je 92 ((uint16_t *)buffer)[len] = v; 5c: c4 e2 7d 79 c8 vpbroadcastw ymm1,xmm0 61: 48 8d 4c 47 e2 lea rcx,[rdi+rax*2-0x1e] 66: 83 e6 70 and esi,0x70 69: 83 c6 f0 add esi,0xfffffff0 6c: c1 ee 04 shr esi,0x4 6f: 83 c6 01 add esi,0x1 72: 83 e6 07 and esi,0x7 75: 48 f7 de neg rsi 78: 31 db xor ebx,ebx 7a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0] 80: c5 fe 7f 09 vmovdqu YMMWORD PTR [rcx],ymm1 for (; len!=0; --len) 84: 48 83 c3 10 add rbx,0x10 88: 48 83 c1 e0 add rcx,0xffffffffffffffe0 8c: 48 83 c6 01 add rsi,0x1 90: 75 ee jne 80 92: 49 83 fe 70 cmp r14,0x70 96: 72 65 jb fd ((uint16_t *)buffer)[len] = v; 98: c4 e2 7d 79 c0 vpbroadcastw ymm0,xmm0 for (; len!=0; --len) 9d: 49 83 c7 01 add r15,0x1 a1: 4d 21 df and r15,r11 a4: 49 29 df sub r15,rbx a7: 48 8d 4c 00 e2 lea rcx,[rax+rax*1-0x1e] ac: 48 01 db add rbx,rbx af: 48 29 d9 sub rcx,rbx b2: 48 01 f9 add rcx,rdi b5: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0] bc: 00 00 00 00 ((uint16_t *)buffer)[len] = v; c0: c5 fe 7f 01 vmovdqu YMMWORD PTR [rcx],ymm0 c4: c5 fe 7f 41 e0 vmovdqu YMMWORD PTR [rcx-0x20],ymm0 c9: c5 fe 7f 41 c0 vmovdqu YMMWORD PTR [rcx-0x40],ymm0 ce: c5 fe 7f 41 a0 vmovdqu YMMWORD PTR [rcx-0x60],ymm0 d3: c5 fe 7f 41 
80 vmovdqu YMMWORD PTR [rcx-0x80],ymm0 d8: c5 fe 7f 81 60 ff ff vmovdqu YMMWORD PTR [rcx-0xa0],ymm0 df: ff e0: c5 fe 7f 81 40 ff ff vmovdqu YMMWORD PTR [rcx-0xc0],ymm0 e7: ff e8: c5 fe 7f 81 20 ff ff vmovdqu YMMWORD PTR [rcx-0xe0],ymm0 ef: ff for (; len!=0; --len) f0: 48 81 c1 00 ff ff ff add rcx,0xffffffffffffff00 f7: 49 83 c7 80 add r15,0xffffffffffffff80 fb: 75 c3 jne c0 fd: 4d 39 d0 cmp r8,r10 100: 74 1a je 11c 102: 4c 29 c8 sub rax,r9 105: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0] 10c: 00 00 00 00 ((uint16_t *)buffer)[len] = v; 110: 66 89 14 47 mov WORD PTR [rdi+rax*2],dx for (; len!=0; --len) 114: 48 83 c0 ff add rax,0xffffffffffffffff 118: 85 c0 test eax,eax 11a: 75 f4 jne 110 return buffer; 11c: 48 89 f8 mov rax,rdi 11f: 5b pop rbx 120: 41 5e pop r14 122: 41 5f pop r15 124: c5 f8 77 vzeroupper 127: c3 ret

UeThi5ne说道：

2019年6月14日下午 5:04

代码嵌入混乱了。我不清楚这里怎么做是正确的。我习惯 StackOverflow 上用 markdown 的修饰符去嵌入代码。

暂时用 pastebin 吧。

C代码：https://pastebin.com/WiHWVzFZ
反汇编结果：https://pastebin.com/SrWC660z

回复
1. ziv2013说道：
  
  2019年6月17日下午 2:11
  
  我直接用的 WordPress 的高亮功能，没有用过 Markdown。
  
  回复
laiWei8b说道：

2019年6月14日下午 5:08

另外建议尽量参考 C 标准（比如 C89, C99, C11），不要依赖于编译器的行为，因为编译器一换或者升级就会发生变化，但符合标准规定的合格编译器是不会和标准冲突，尤其是开优化的时候，很可能 undefined behavior 会导致出乎意料的结果。

回复

仔细想了下，你可能并不是想提示编译器不优化，而是提示编译器编译时不把手工代码优化成库函数调用，或者更一般的，不引入库函数调用。类似 gcc 的 -fno-tree-loop-distribute-patterns：https://stackoverflow.com/a/33818680/1190191

msvc 好像没有类似的选项。此外 msvc 是 C++ 编译器，只是可以编译 C，但对标准的支持不理想：https://stackoverflow.com/a/48615212/1190191

不调用库函数，gcc 也能优化循环。看上去那个手动用 8 bytes 的优化是不需要的（甚至我不知道这样做是不是可能干扰编译器优化）。

博主，你好，我想请教下有没有什么方法可以在UEFI下获取空闲内存呢

ziv2013说道：

2020年1月14日上午 8:50

你是想获得当前系统内存分配情况？参考 https://www.lab-z.com/revmem/

回复

《Step to UEFI (176）memset的实现方法》有7个想法

发表回复 取消回复

发表回复取消回复