1.4K Star 7.6K Fork 1.4K

GVP方舟编译器 / OpenArkCompiler

 / 详情

【spec性能分析】ivopt优化

待办的
成员
创建于  
2021-05-24 17:17
  • 源码:spec 525 pixel.c 函数:x264_pixel_satd_8x4

maple iv相关生成代码:

 754   mov w0, #0
 .L.36__2:
 ...
 821   add x12, x29, #16   <====== "tmp base地址 冗余"
 822   sxtw  x13, w0
 823   lsl x13, x13, #2  <===== "tmp[i] 地址 单循环内不变"
 824   str w11, [x12,x13,LSL #2]  <===== "tmp[i][0]"
 825   sub w9, w9, w10
 826   add x10, x29, #16  <====== "tmp base地址  冗余"
 827   sxtw  x11, w0
 828   lsl x11, x11, #2  <===== "tmp[i] 地址 单循环内不变 冗余"
 829   add x11, x11, #2
 830   str w9, [x10,x11,LSL #2]      <===== "tmp[i][2]"
...
 847   add w0, w0, #1  <==== "w0是i"
 848   add x3, x3, x1  
 849   add x4, x4, x2
 850   cmp w0, #4
 851   blt .L.36__2

gcc iv相关代码:

sxtw   x1, w1
sxtw   x3, w3
mov    x6, sp
add    x11, sp, #0x40  <===== "x11为tmp末尾地址(tmp + 64,即&tmp[4][0]),作为循环结束边界"
mov    x7, x6  <========  "x7为tmp起始地址"
18:
...
add    x0, x0, x1
add    x2, x2, x3
str    w8, [x7]   <=====  tmp[i][0]
str    w10, [x7, #8] <=====  tmp[i][2]
add    w8, w4, w5
sub    w4, w4, w5
str    w8, [x7, #4] <=====  tmp[i][1]
str    w4, [x7, #12] <=====  tmp[i][3]
add    x7, x7, #0x10   <====== " tmp[i]地址,每次+16,作为iv"
cmp    x11, x7   
b.ne   18

希望经过LFO或者strength reduction优化后,maple也能生成与gcc类似的代码:以tmp[i]的地址作为IV(每次迭代+16),并用常量偏移访问tmp[i][0..3]。

另外,即使不考虑ivopt相关优化的因素,仅就当前pre优化的能力而言,以下计算也都是冗余计算,应该可以删除,但是实际没有被删除:
4次add x12, x29, #16 <====== "tmp base地址 冗余"
后3次的lsl x11, x11, #2 <===== "tmp[i] 地址 单循环内不变 冗余"
我们分析认为,原因是中端IR把数组地址计算保留为一个整体的array节点:
array 1 a64 <* <[4] <[4] u32>>> (addrof u64 %tmp, regread i32 %82, constval i32 0)
它应该在合适时机打散成普通运算而不是一个整体,这样pre才有机会识别并消除其中的冗余子表达式(推测,待确认)。

评论 (2)

Leo Young 创建了任务
Leo Young 修改了描述
Leo Young 修改了描述
Leo Young 修改了标题
Leo Young 修改了标题
Leo Young 修改了描述
展开全部操作日志

本地开启SR + LFTR + PR712后仍有正确性问题。不过从SPEC 525 dct.c中sub4x4_dct函数的汇编可以看出,循环内的冗余代码已有所缓解,但仍有较大的改进空间。
主要在于:

  1. 循环外提的代码过多,且存在大量冗余
  2. 循环内IV选取有更合理的选择,且IV数量过多,可以合并

例如:

    int16_t d[16];
    int16_t tmp[16];
    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3];
        int s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3];
        int d12 = d[i*4+1] - d[i*4+2];

        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }

对应me.mpl

 417   regassign i32 %105 (constval i32 0)
 418   regassign i32 %45 (mul i32 (regread i32 %105, constval i32 4))  # 0
 419   regassign i32 %46 (add i32 (regread i32 %45, constval i32 2))   # 2
 420   regassign a64 %47 (cvt a64 i32 (regread i32 %46))  # 2
 421   regassign a64 %48 (mul a64 (regread a64 %47, constval a64 2))   # 4
 422   regassign a64 %49 (add a64 (addrof u64 %d, regread a64 %48))  # d + 4
 423   regassign a64 %50 (add a64 (
 424       cvt a64 u64 (addrof u64 %d),
 425       constval a64 36))
 426   regassign i32 %52 (add i32 (regread i32 %45, constval i32 1))  # 1
 427   regassign a64 %53 (cvt a64 i32 (regread i32 %52))  # 1
 428   regassign a64 %54 (mul a64 (regread a64 %53, constval a64 2))  # 2
 429   regassign a64 %55 (add a64 (addrof u64 %d, regread a64 %54))  # d + 2
 430   regassign i32 %60 (add i32 (regread i32 %45, constval i32 3))  # 3
 431   regassign a64 %61 (cvt a64 i32 (regread i32 %60))  # 3
 432   regassign a64 %62 (mul a64 (regread a64 %61, constval a64 2)) # 6
 433   regassign a64 %63 (add a64 (addrof u64 %d, regread a64 %62))  # d + 6
 434   regassign a64 %66 (cvt a64 i32 (regread i32 %45))  # 0
 435   regassign a64 %67 (mul a64 (regread a64 %66, constval a64 2)) # 0
 436   regassign a64 %68 (add a64 (addrof u64 %d, regread a64 %67))  # d + 0
 437   regassign a64 %73 (cvt a64 i32 (regread i32 %105))  # 0
 438   regassign a64 %74 (mul a64 (regread a64 %73, constval a64 2)) # 0
 439   regassign a64 %75 (add a64 (addrof u64 %tmp, regread a64 %74)) # tmp + 0
 440   regassign i32 %77 (add i32 (regread i32 %105, constval i32 4)) # 4
 441   regassign a64 %78 (cvt a64 i32 (regread i32 %77)) # 4
 442   regassign a64 %79 (mul a64 (regread a64 %78, constval a64 2)) # 8
 443   regassign a64 %80 (add a64 (addrof u64 %tmp, regread a64 %79)) # tmp + 8
 444   regassign i32 %82 (add i32 (regread i32 %105, constval i32 8)) # 8
 445   regassign a64 %83 (cvt a64 i32 (regread i32 %82)) # 8
 446   regassign a64 %84 (mul a64 (regread a64 %83, constval a64 2)) # 16
 447   regassign a64 %85 (add a64 (addrof u64 %tmp, regread a64 %84)) # tmp + 16
 448   regassign i32 %87 (add i32 (regread i32 %105, constval i32 12)) # 12
 449   regassign a64 %88 (cvt a64 i32 (regread i32 %87)) # 12
 450   regassign a64 %89 (mul a64 (regread a64 %88, constval a64 2)) # 24
 451   regassign a64 %90 (add a64 (addrof u64 %tmp, regread a64 %89)) # tmp + 24
 452 @@7   regassign i32 %51 (iread i32 <* i16> 0 (regread a64 %49))
 453   regassign i32 %57 (iread i32 <* i16> 0 (regread a64 %55))
 454   regassign i32 %59 (add i32 (regread i32 %57, regread i32 %51))
 455   regassign i32 %65 (iread i32 <* i16> 0 (regread a64 %63))
 456   regassign i32 %70 (iread i32 <* i16> 0 (regread a64 %68))
 457   regassign i32 %72 (add i32 (regread i32 %70, regread i32 %65))
 458 LOC 2 126
 459   iassign <* i16> 0 (
 460     regread a64 %75,
 461     sext i32 16 (add i32 (regread i32 %72, regread i32 %59)))
 462   regassign i32 %58 (sub i32 (regread i32 %57, regread i32 %51))
 463   regassign i32 %71 (sub i32 (regread i32 %70, regread i32 %65))
 464 LOC 2 127
 465   iassign <* i16> 0 (
 466     regread a64 %80,
 467     sext i32 16 (add i32 (
 468       regread i32 %58,
 469       mul i32 (regread i32 %71, constval i32 2))))
 470 LOC 2 128
 471   iassign <* i16> 0 (
 472     regread a64 %85,
 473     sext i32 16 (sub i32 (regread i32 %72, regread i32 %59)))
 474 LOC 2 129
 475   iassign <* i16> 0 (
 476     regread a64 %90,
 477     sext i32 16 (sub i32 (
 478       regread i32 %71,
 479       mul i32 (regread i32 %58, constval i32 2))))
 480   regassign a64 %90 (add a64 (regread a64 %90, constval a64 2))
 481   regassign a64 %85 (add a64 (regread a64 %85, constval a64 2))
 482   regassign a64 %80 (add a64 (regread a64 %80, constval a64 2))
 483   regassign a64 %75 (add a64 (regread a64 %75, constval a64 2))
 484   regassign a64 %68 (add a64 (regread a64 %68, constval a64 8))
 485   regassign a64 %63 (add a64 (regread a64 %63, constval a64 8))
 486   regassign a64 %55 (add a64 (regread a64 %55, constval a64 8))
 487   regassign a64 %49 (add a64 (regread a64 %49, constval a64 8))
 488   brtrue @@7 (lt i32 a64 (regread a64 %49, regread a64 %50))

可以看出:
1.我们有8个IV(%49,%55,%63,%68,%75,%80,%85,%90),数量过多,理论上只需2个即可:%49,%55,%63均可用%68 + const替换,%80,%85,%90均可用%75 + const替换
2.我们选取%49(addrof %d + 4)作为循环退出判断所用的IV,不够理想,应该选用%68(addrof %d + 0)
3.循环外417-451存在大量冗余(各行末尾的#注释为prop+folding后的结果)

PR720 已解决以上问题。现在的me会输出(循环内只剩%3和%18两个IV):

func &foo () void {
  funcid 1

LOC 2 10
  regassign i32 %20 (constval i32 0)
  regassign i32 %1 (mul i32 (regread i32 %20, constval i32 4))
  regassign a64 %2 (cvt a64 i32 (regread i32 %1))
  regassign a64 %3 (mul a64 (regread a64 %2, constval a64 2))
  regassign a64 %17 (cvt a64 i32 (regread i32 %20))
  regassign a64 %18 (mul a64 (regread a64 %17, constval a64 2))
  regassign a64 %26 (addrof a64 $tmp)
  regassign a64 %27 (addrof a64 $d)
@@2   regassign a64 %4 (add a64 (regread a64 %27, regread a64 %3))
  regassign i32 %5 (iread i32 <* i16> 0 (add a64 (regread a64 %4, constval a64 4)))
  regassign i32 %6 (cvt i32 i16 (regread i32 %5))
  regassign i32 %7 (iread i32 <* i16> 0 (add a64 (regread a64 %4, constval a64 2)))
  regassign i32 %8 (cvt i32 i16 (regread i32 %7))
  regassign i32 %10 (add i32 (regread i32 %8, regread i32 %6))
  regassign i32 %11 (iread i32 <* i16> 0 (add a64 (regread a64 %4, constval a64 6)))
  regassign i32 %12 (cvt i32 i16 (regread i32 %11))
  regassign i32 %13 (iread i32 <* i16> 0 (regread a64 %4))
  regassign i32 %14 (cvt i32 i16 (regread i32 %13))
  regassign i32 %16 (add i32 (regread i32 %14, regread i32 %12))
  regassign a64 %19 (add a64 (regread a64 %26, regread a64 %18))
LOC 2 12
  iassign <* i16> 0 (
    regread a64 %19, 
    cvt i16 i32 (add i32 (regread i32 %16, regread i32 %10)))
  regassign i32 %9 (sub i32 (regread i32 %8, regread i32 %6))
  regassign i32 %15 (sub i32 (regread i32 %14, regread i32 %12))
LOC 2 13
  iassign <* i16> 0 (
    add a64 (regread a64 %19, constval a64 8), 
    cvt i16 i32 (add i32 (
      mul i32 (regread i32 %15, constval i32 2),
      regread i32 %9)))
LOC 2 14
  iassign <* i16> 0 (
    add a64 (regread a64 %19, constval a64 16), 
    cvt i16 i32 (sub i32 (regread i32 %16, regread i32 %10)))
LOC 2 15
  iassign <* i16> 0 (
    add a64 (regread a64 %19, constval a64 24), 
    cvt i16 i32 (sub i32 (
      regread i32 %15,
      mul i32 (regread i32 %9, constval i32 2))))
  regassign a64 %18 (add a64 (regread a64 %18, constval a64 2))
  regassign a64 %3 (add a64 (regread a64 %3, constval a64 8))
  brtrue @@2 (lt i32 i32 (regread a64 %3, constval a64 32))
@@1 LOC 2 5
  return ()
}

登录 后才可以发表评论

状态
负责人
里程碑
Pull Requests
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
开始日期   -   截止日期
-
置顶选项
优先级
参与者(2)
C++
1
https://gitee.com/openarkcompiler/OpenArkCompiler.git
git@gitee.com:openarkcompiler/OpenArkCompiler.git
openarkcompiler
OpenArkCompiler
OpenArkCompiler

搜索帮助