支持向量化的编译器可以通过高效地使用NEON硬件单元来并行化C/C++代码。C语言没有用于指定并行行为的语法,但可以给编译器提供各种提示。
对于指针变量,可以在定义时加上 "__restrict" 关键字,但必须保证它所指向的这片内存区域不会被其他任何指针引用或修改(This has the effect of guaranteeing that pointers will not address overlapping regions of memory)。
循环次数明确为4或者8的倍数,也会被编译器利用进行优化。
当编译代码时,加上选项 -O1 -ftree-vectorize -mfpu=neon。
示例代码:
#include <stdio.h>
/*
 * Demo program: multiplies every element of a fixed 8-element int array
 * by 3 in place, then prints each element on its own line.
 *
 * argc and argv are accepted but not used.
 * Always returns 0.
 */
int main(int argc, char *argv[])
{
    /* Element count and scale factor are compile-time constants. */
    enum { COUNT = 8, FACTOR = 3 };
    int values[COUNT] = {1, 2, 3, 4, 5, 6, 7, 8};
    int idx;

    /* Simple element-wise update with a fixed trip count of 8. */
    for (idx = 0; idx < COUNT; ++idx) {
        values[idx] = values[idx] * FACTOR;
    }

    /* Emit the scaled values, one per line. */
    idx = 0;
    while (idx < COUNT) {
        printf("%d\n", values[idx]);
        ++idx;
    }

    return 0;
}
反汇编(V开头的指令即NEON指令):
00010408 <main>:
10408: e92d4070 push {r4, r5, r6, lr}
1040c: e24dd020 sub sp, sp, #32
10410: e1a0c00d mov ip, sp
10414: e59fe05c ldr lr, [pc, #92] ; 10478 <main+0x70>
10418: e8be000f ldm lr!, {r0, r1, r2, r3}
1041c: e8ac000f stmia ip!, {r0, r1, r2, r3}
10420: e89e000f ldm lr, {r0, r1, r2, r3}
10424: e88c000f stm ip, {r0, r1, r2, r3}
10428: f2c04053 vmov.i32 q10, #3 ; 0x00000003
1042c: f46d2adf vld1.64 {d18-d19}, [sp :64]
10430: f26229f4 vmul.i32 q9, q9, q10
10434: f44d2adf vst1.64 {d18-d19}, [sp :64]
10438: eddd0b04 vldr d16, [sp, #16]
1043c: eddd1b06 vldr d17, [sp, #24]
10440: f26009f4 vmul.i32 q8, q8, q10
10444: edcd0b04 vstr d16, [sp, #16]
10448: edcd1b06 vstr d17, [sp, #24]
1044c: e1a0400d mov r4, sp
10450: e28d6020 add r6, sp, #32
10454: e59f5020 ldr r5, [pc, #32] ; 1047c <main+0x74>
10458: e4941004 ldr r1, [r4], #4
1045c: e1a00005 mov r0, r5
10460: ebffffa0 bl 102e8 <printf@plt>
10464: e1560004 cmp r6, r4
10468: 1afffffa bne 10458 <main+0x50>
1046c: e3a00000 mov r0, #0
10470: e28dd020 add sp, sp, #32
10474: e8bd8070 pop {r4, r5, r6, pc}
10478: 000104f0 .word 0x000104f0
1047c: 00010510 .word 0x00010510
|