之前的文章“哪里来的的 memset”【参考1】提到过因为编译器擅作主张使用memset优化引起了很诡异的问题。可以通过关闭编译优化来避免错误,这里从代码的角度分析 EDK2 是如何实现 memset 功能的。
1. \MdePkg\Library\BaseMemoryLib\MemLibGeneric.c 提供了三个函数:
InternalMemSetMem16
InternalMemSetMem32
InternalMemSetMem64
以 InternalMemSetMem16 为例:
/**
  Stores a 16-bit value into every element of a buffer and returns the buffer.

  The elements are written from the highest index (Length - 1) down to
  index 0.

  @param  Buffer  Pointer to the target buffer to fill.
  @param  Length  Number of 16-bit elements to write.
  @param  Value   The 16-bit value stored into each element.

  @return Buffer
**/
VOID *
EFIAPI
InternalMemSetMem16 (
  OUT VOID    *Buffer,
  IN  UINTN   Length,
  IN  UINT16  Value
  )
{
  UINT16  *Words;
  UINTN   Remaining;

  Words     = (UINT16 *)Buffer;
  Remaining = Length;

  //
  // Count down so that Remaining is always the index of the next element
  // to be written once it has been decremented.
  //
  while (Remaining > 0) {
    Remaining--;
    Words[Remaining] = Value;
  }

  return Buffer;
}
看起来 for (; Length != 0; Length--) 这样的写法足够“迷惑”编译器避免优化。
2. \MdePkg\Library\BaseMemoryLib\SetMem.c 提供了InternalMemSetMem()
/**
  Set Buffer to Value for Length bytes.

  When Buffer is 8-byte aligned and at least 8 bytes remain, the bulk of the
  buffer is written 64 bits at a time; when it is 4-byte aligned and at least
  4 bytes remain, 32 bits at a time. Whatever is left (or the whole buffer if
  neither condition holds) is written byte by byte.

  @param  Buffer  The memory to set.
  @param  Length  The number of bytes to set.
  @param  Value   The byte value written to every byte of Buffer.

  @return Buffer
**/
VOID *
EFIAPI
InternalMemSetMem (
  OUT VOID   *Buffer,
  IN  UINTN  Length,
  IN  UINT8  Value
  )
{
  //
  // Declare the local variables that actually move the data elements as
  // volatile to prevent the optimizer from replacing this function with
  // the intrinsic memset()
  //
  volatile UINT8   *Pointer8;
  volatile UINT32  *Pointer32;
  volatile UINT64  *Pointer64;
  UINT32           Value32;
  UINT64           Value64;

  if ((((UINTN)Buffer & 0x7) == 0) && (Length >= 8)) {
    //
    // Generate the 64bit value.
    // Value is widened to UINT32 before shifting: a UINT8 operand would be
    // promoted to (signed) int, and shifting a value >= 0x80 left by 24
    // overflows int, which is undefined behavior in C.
    //
    Value32 = ((UINT32)Value << 24) | ((UINT32)Value << 16) | ((UINT32)Value << 8) | (UINT32)Value;
    Value64 = LShiftU64 (Value32, 32) | Value32;

    Pointer64 = (UINT64 *)Buffer;
    while (Length >= 8) {
      *(Pointer64++) = Value64;
      Length        -= 8;
    }

    // Finish with bytes if needed
    Pointer8 = (UINT8 *)Pointer64;
  } else if ((((UINTN)Buffer & 0x3) == 0) && (Length >= 4)) {
    //
    // Generate the 32bit value (widened first, see above).
    //
    Value32 = ((UINT32)Value << 24) | ((UINT32)Value << 16) | ((UINT32)Value << 8) | (UINT32)Value;

    Pointer32 = (UINT32 *)Buffer;
    while (Length >= 4) {
      *(Pointer32++) = Value32;
      Length        -= 4;
    }

    // Finish with bytes if needed
    Pointer8 = (UINT8 *)Pointer32;
  } else {
    // Unaligned or short buffer: everything is written by the byte loop below.
    Pointer8 = (UINT8 *)Buffer;
  }

  // Write the remaining (fewer than 8 or 4) bytes, or the whole buffer.
  while (Length-- > 0) {
    *(Pointer8++) = Value;
  }

  return Buffer;
}
避免被编译器优化的方法和上面的类似,此外还可以看出这个函数特地用 8 bytes填充提升效率。
3. \MdePkg\Library\UefiMemoryLib\MemLib.c 中的InternalMemSetMem 函数直接调用 gBS 提供的服务
/**
Fills a target buffer with a byte value, and returns the target buffer.
This function is a thin wrapper: it forwards its three arguments unchanged to
gBS->SetMem() and performs no work of its own.
@param Buffer Memory to set.
@param Size The number of bytes to set.
@param Value The byte value passed to gBS->SetMem() as the fill value.
@return Buffer, i.e. the same pointer that was passed in.
**/
VOID *
EFIAPI
InternalMemSetMem (
OUT VOID *Buffer,
IN UINTN Size,
IN UINT8 Value
)
{
// Delegate the actual fill to the SetMem service reached through gBS.
gBS->SetMem (Buffer, Size, Value);
// Return the caller's pointer so the call can be used in an expression.
return Buffer;
}
4. 通过 volatile 声明变量避免编译器的优化,简单粗暴,和前面 2 提到的没有本质差别。volatile 是一个类型限定符(type qualifier)。volatile 的作用是作为指令关键字,确保本条指令不会因编译器的优化而省略,且要求每次直接读值。用 volatile 修饰的变量表示这个变量可能会被意想不到地改变,这样,编译器就不会去假设这个变量的值了。【参考2】
\EdkCompatibilityPkg\Foundation\Library\EdkIIGlueLib\Library\BaseMemoryLib\Ebc\SetMem.c
/**
  Writes Value into each of the first Size bytes of Buffer.

  @param  Buffer  Memory to set.
  @param  Size    Number of bytes to set.
  @param  Value   Byte value stored into every byte.

  @return Buffer
**/
VOID *
EFIAPI
InternalMemSetMem (
  IN VOID   *Buffer,
  IN UINTN  Size,
  IN UINT8  Value
  )
{
  //
  // The pointer that performs the stores is volatile-qualified so that the
  // optimizer does not replace this loop with the intrinsic memset().
  //
  volatile UINT8  *Cursor;
  UINTN           Remaining;

  Cursor = (UINT8*)Buffer;
  for (Remaining = Size; Remaining != 0; Remaining--) {
    *Cursor = Value;
    Cursor++;
  }

  return Buffer;
}
5.汇编语言实现
\EdkCompatibilityPkg\Foundation\Library\CompilerStub\X64\memset.asm
\EdkCompatibilityPkg\Foundation\Library\CompilerStub\Ia32\memset.asm
IA32汇编的实现
; IA32 (MASM) implementation of memset. Whole groups of 8 bytes are stored
; through an MMX register; the remaining 0-7 bytes are stored with rep stosb.
.686
.model flat,C
.mmx
.code
;------------------------------------------------------------------------------
; VOID *
; memset (
; OUT VOID *Buffer,
; IN UINT8 Value,
; IN UINTN Count
; )
;
; Fills Count bytes starting at Buffer with Value and returns Buffer in eax.
; Arguments are read from the stack at [esp + 8] (Buffer), [esp + 12] (Value)
; and [esp + 16] (Count) - presumably offset by the edi push that the
; assembler emits for "USES edi"; confirm against the generated prologue.
; NOTE(review): rep stosb relies on the direction flag being clear, and no
; emms is issued after the MMX stores; nothing in this routine establishes
; either - confirm against the calling convention / callers.
;------------------------------------------------------------------------------
memset PROC USES edi
; al <- Value
mov al, [esp + 12]
; ax <- Value replicated into both bytes
mov ah, al
; Move ax into the upper 16 bits of edx ...
shrd edx, eax, 16
; ... then shift eax left by 16 and refill its low half from edx, so that
; every byte of eax holds Value.
shld eax, edx, 16
mov ecx, [esp + 16] ; ecx <- Count
cmp ecx, 0 ; if Count == 0, do nothing
je @SetDone
mov edi, [esp + 8] ; edi <- Buffer
; edx <- Count mod 8: number of trailing bytes left for rep stosb
mov edx, ecx
and edx, 7
shr ecx, 3 ; # of Qwords to set
; Fewer than 8 bytes in total: skip the MMX path entirely
jz @SetBytes
; Reserve 16 bytes of stack to preserve the caller's mm0 and mm1
add esp, -10h
movq [esp], mm0 ; save mm0
movq [esp + 8], mm1 ; save mm1
; Low dword of mm0 and of mm1 <- the 4-byte fill pattern
movd mm0, eax
movd mm1, eax
; Move mm0's pattern into its upper dword, then merge in mm1's lower dword
psllq mm0, 32
por mm0, mm1 ; fill mm0 with 8 Value's
; Qword loop: store 8 bytes, advance edi, repeat ecx times
@@:
movq [edi], mm0
add edi, 8
loop @B
movq mm0, [esp] ; restore mm0
movq mm1, [esp + 8] ; restore mm1
add esp, 10h ; stack cleanup
@SetBytes:
; ecx <- trailing byte count; al still holds Value for stosb
mov ecx, edx
rep stosb
@SetDone:
; esp is back at its post-prologue value here, so [esp + 8] is Buffer again
mov eax, [esp + 8] ; eax <- Buffer as return value
ret
memset ENDP
END
上面就是实现 SetMem 函数的基本方法,如果在 Porting 代码到 UEFI时遇到 MemSet 的错误,不妨试试直接将上面的代码搬迁到程序中。
参考:
1. http://www.lab-z.com/stu136/ Step to UEFI (136)哪里来的的 memset
2. https://baike.baidu.com/item/volatile/10606957?fr=aladdin volatile
用 volatile 修饰才是正确的方法,尤其是涉及嵌入式开发的有副作用的写入。参见 https://en.cppreference.com/w/c/language/volatile
另外参见 https://stackoverflow.com/a/15618139/1190191
第一个代码,我重写了一下:
#include <stdint.h>

/**
 * Fill the first len 16-bit elements of buffer with v.
 *
 * @param buffer  Target buffer; must have room for at least len uint16_t
 *                elements.
 * @param len     Number of 16-bit elements to write (not a byte count).
 * @param v       Value stored into each element.
 * @return        buffer, unchanged.
 */
void *internal_memset(void *buffer, unsigned int len, uint16_t v)
{
    uint16_t *words = buffer;

    /*
     * Walk from the last element down to the first. The valid indices are
     * len - 1 .. 0; indexing with len itself would skip element 0 and write
     * one element past the end of the buffer.
     */
    for (; len != 0; --len) {
        words[len - 1] = v;
    }
    return buffer;
}
在 Linux 上用 clang 开 -O2 -march=native 编译,反汇编发现它会用 SIMD 指令集 vectorization:
0000000000000000 <internal_memset>:
#include <stdint.h>
void *internal_memset(void *buffer, unsigned int len, uint16_t v)
{
for (; len!=0; --len)
0: 41 57 push r15
2: 41 56 push r14
4: 53 push rbx
5: 85 f6 test esi,esi
7: 0f 84 0f 01 00 00 je 11c
d: 89 f0 mov eax,esi
f: 44 8d 7e ff lea r15d,[rsi-0x1]
13: 4d 8d 47 01 lea r8,[r15+0x1]
17: 49 83 f8 10 cmp r8,0x10
1b: 0f 82 ef 00 00 00 jb 110
21: 49 bb f0 ff ff ff 01 movabs r11,0x1fffffff0
28: 00 00 00
2b: 4d 89 c1 mov r9,r8
2e: 4d 21 d9 and r9,r11
31: 4d 89 c2 mov r10,r8
34: 4d 21 da and r10,r11
37: 0f 84 d3 00 00 00 je 110
3d: c5 f9 6e c2 vmovd xmm0,edx
41: 4d 8d 77 01 lea r14,[r15+0x1]
45: 4d 21 de and r14,r11
48: 49 83 c6 f0 add r14,0xfffffffffffffff0
4c: 44 89 f1 mov ecx,r14d
4f: c1 e9 04 shr ecx,0x4
52: 83 c1 01 add ecx,0x1
55: 31 db xor ebx,ebx
57: f6 c1 07 test cl,0x7
5a: 74 36 je 92
((uint16_t *)buffer)[len] = v;
5c: c4 e2 7d 79 c8 vpbroadcastw ymm1,xmm0
61: 48 8d 4c 47 e2 lea rcx,[rdi+rax*2-0x1e]
66: 83 e6 70 and esi,0x70
69: 83 c6 f0 add esi,0xfffffff0
6c: c1 ee 04 shr esi,0x4
6f: 83 c6 01 add esi,0x1
72: 83 e6 07 and esi,0x7
75: 48 f7 de neg rsi
78: 31 db xor ebx,ebx
7a: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
80: c5 fe 7f 09 vmovdqu YMMWORD PTR [rcx],ymm1
for (; len!=0; --len)
84: 48 83 c3 10 add rbx,0x10
88: 48 83 c1 e0 add rcx,0xffffffffffffffe0
8c: 48 83 c6 01 add rsi,0x1
90: 75 ee jne 80
92: 49 83 fe 70 cmp r14,0x70
96: 72 65 jb fd
((uint16_t *)buffer)[len] = v;
98: c4 e2 7d 79 c0 vpbroadcastw ymm0,xmm0
for (; len!=0; --len)
9d: 49 83 c7 01 add r15,0x1
a1: 4d 21 df and r15,r11
a4: 49 29 df sub r15,rbx
a7: 48 8d 4c 00 e2 lea rcx,[rax+rax*1-0x1e]
ac: 48 01 db add rbx,rbx
af: 48 29 d9 sub rcx,rbx
b2: 48 01 f9 add rcx,rdi
b5: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
bc: 00 00 00 00
((uint16_t *)buffer)[len] = v;
c0: c5 fe 7f 01 vmovdqu YMMWORD PTR [rcx],ymm0
c4: c5 fe 7f 41 e0 vmovdqu YMMWORD PTR [rcx-0x20],ymm0
c9: c5 fe 7f 41 c0 vmovdqu YMMWORD PTR [rcx-0x40],ymm0
ce: c5 fe 7f 41 a0 vmovdqu YMMWORD PTR [rcx-0x60],ymm0
d3: c5 fe 7f 41 80 vmovdqu YMMWORD PTR [rcx-0x80],ymm0
d8: c5 fe 7f 81 60 ff ff vmovdqu YMMWORD PTR [rcx-0xa0],ymm0
df: ff
e0: c5 fe 7f 81 40 ff ff vmovdqu YMMWORD PTR [rcx-0xc0],ymm0
e7: ff
e8: c5 fe 7f 81 20 ff ff vmovdqu YMMWORD PTR [rcx-0xe0],ymm0
ef: ff
for (; len!=0; --len)
f0: 48 81 c1 00 ff ff ff add rcx,0xffffffffffffff00
f7: 49 83 c7 80 add r15,0xffffffffffffff80
fb: 75 c3 jne c0
fd: 4d 39 d0 cmp r8,r10
100: 74 1a je 11c
102: 4c 29 c8 sub rax,r9
105: 66 66 2e 0f 1f 84 00 data16 nop WORD PTR cs:[rax+rax*1+0x0]
10c: 00 00 00 00
((uint16_t *)buffer)[len] = v;
110: 66 89 14 47 mov WORD PTR [rdi+rax*2],dx
for (; len!=0; --len)
114: 48 83 c0 ff add rax,0xffffffffffffffff
118: 85 c0 test eax,eax
11a: 75 f4 jne 110
return buffer;
11c: 48 89 f8 mov rax,rdi
11f: 5b pop rbx
120: 41 5e pop r14
122: 41 5f pop r15
124: c5 f8 77 vzeroupper
127: c3 ret
代码嵌入混乱了。我不清楚这里怎么做是正确的。我习惯 StackOverflow 上用 markdown 的修饰符去嵌入代码。
暂时用 pastebin 吧。
C代码:https://pastebin.com/WiHWVzFZ
反汇编结果:https://pastebin.com/SrWC660z
我直接用的 WordPress 的高亮功能,没有用过 Markdown。
另外建议尽量参考 C 标准(比如 C89, C99, C11),不要依赖于编译器的行为,因为编译器一换或者升级就会发生变化,但符合标准规定的合格编译器是不会和标准冲突,尤其是开优化的时候,很可能 undefined behavior 会导致出乎意料的结果。
仔细想了下,你可能并不是想提示编译器不优化,而是提示编译器编译时不把手工代码优化成库函数调用,或者更一般的,不引入库函数调用。类似 gcc 的 -fno-tree-loop-distribute-patterns:https://stackoverflow.com/a/33818680/1190191
msvc 好像没有类似的选项。此外 msvc 是 C++ 编译器,只是可以编译 C,但对标准的支持不理想:https://stackoverflow.com/a/48615212/1190191
不调用库函数,gcc 也能优化循环。看上去那个手动用 8 bytes 的优化是不需要的(甚至我不知道这样做是不是可能干扰编译器优化)。
博主,你好,我想请教下有没有什么方法可以在UEFI下获取空闲内存呢
你是想获得当前系统内存分配情况? 参考 https://www.lab-z.com/revmem/