1.4K Star 7.3K Fork 1.3K

GVP方舟编译器 / OpenArkCompiler

 / 详情

loop focus optimization example

Backlog
member
Opened this issue  
2021-05-12 17:23

以下是一个LFO典型场景,这里我们不考虑用bzero的方法, 仅从循环本身处理出发牵引lfo的实现。

/*
 * Aggregate used by the StructInit loop test case: one scalar, a
 * 15-element (0xf) unsigned int array, and a 65536-element (1ul << 16)
 * unsigned short array.
 */
typedef struct {
    unsigned int aa;
    unsigned int bb[0xf];
    unsigned short cc[(1ul << 16)];
} StructTest;


/*
 * Store zero to base->aa and to every element of base->cc.
 *
 * The loop is unrolled by hand: each iteration writes four consecutive
 * cc elements (i + 0 .. i + 3) and then advances i by 4, covering indices
 * 0 .. 65535. base->bb is not written. base is dereferenced without a
 * null check.
 */
void StructInit(StructTest *base)
{
    unsigned int i;

    base->aa = 0;
    /*
     * The bound (1ul << 16) has type unsigned long, so the unsigned int
     * counter i is converted to unsigned long for the comparison.
     */
    for (i = 0; i < (1ul << 16); i += 0x4) {
        base->cc[i + 0x0] = 0;
        base->cc[i + 0x1] = 0;
        base->cc[i + 0x2] = 0;
        base->cc[i + 0x3] = 0;
    }
}

我们的代码不够优化:

.L.28__2:
        add     w2, w0, #1
        strh    wzr, [x1,w0,UXTW #1]
        strh    wzr, [x1,w2,UXTW #1]
        add     w2, w0, #2
        strh    wzr, [x1,w2,UXTW #1]
        add     w2, w0, #3
        add     w0, w0, #4
        strh    wzr, [x1,w2,UXTW #1]
        uxtw    x2, w0
        cmp     x2, #65536
        blo     .L.28__2

GCC -O2对induction var做了优化:

.L368:
    strh    wzr, [x0]
    add x0, x0, 8
    strh    wzr, [x0, -6]
    strh    wzr, [x0, -4]
    strh    wzr, [x0, -2]
    cmp x0, x1
    bne .L368

GCC -O3 做了vectorization:
.L551:

    str q0, [x0], 16
    cmp x1, x0
    bne .L551

Comments (4)

yi_jiang created任务
yi_jiang set assignee to fredchow
yi_jiang assigned collaborator yi_jiang
yi_jiang set priority to Main
Expand operation logs

Let me bring up strength reduction first, and then we need to work on auto-vectorization.

With --strengthreduction given to mapleme, we now arrive at:

func &StructInit (reg %13 <* <$_anon1>>) void {
funcid 1

LOC 2 12
iassign <* <$_anon1>> 1 (regread ptr %13, constval u32 0)
LOC 2 13
regassign u32 %14 (constval u32 0)
regassign ptr %1 (iaddrof a64 <* <$_anon1>> 3 (regread ptr %13))
regassign a64 %2 (mul a64 (regread u32 %14, constval a64 2))
regassign u32 %3 (add u32 (regread u32 %14, constval u32 1))
regassign u32 %4 (add u32 (regread u32 %14, constval u32 2))
regassign u32 %5 (add u32 (regread u32 %14, constval u32 3))
regassign a64 %6 (add a64 (regread ptr %1, regread a64 %2))
regassign a64 %7 (mul a64 (regread u32 %3, constval a64 2))
regassign a64 %8 (mul a64 (regread u32 %4, constval a64 2))
regassign a64 %9 (mul a64 (regread u32 %5, constval a64 2))
regassign a64 %10 (add a64 (regread ptr %1, regread a64 %7))
regassign a64 %11 (add a64 (regread ptr %1, regread a64 %8))
regassign a64 %12 (add a64 (regread ptr %1, regread a64 %9))
@@2 LOC 2 14
iassign <* u16> 0 (regread a64 %6, constval u16 0)
LOC 2 15
iassign <* u16> 0 (regread a64 %10, constval u16 0)
LOC 2 16
iassign <* u16> 0 (regread a64 %11, constval u16 0)
LOC 2 17
iassign <* u16> 0 (regread a64 %12, constval u16 0)
LOC 2 13
regassign u32 %14 (add u32 (regread u32 %14, constval u32 4))
regassign a64 %12 (add a64 (regread a64 %12, constval a64 8))
regassign a64 %11 (add a64 (regread a64 %11, constval a64 8))
regassign a64 %10 (add a64 (regread a64 %10, constval a64 8))
regassign a64 %6 (add a64 (regread a64 %6, constval a64 8))
brtrue @@2 (lt i32 u64 (
cvt u64 u32 (regread u32 %14),
constval u64 0x10000))
@@1 return ()
}

Next, I'll implement Linear Function Test Replacement (LFTR) so as to delete %i (%14 above) completely.

The output from maple_me is now much improved:

func &StructInit (reg %6 <* <$_anon1>>) void {
  funcid 1

LOC 2 12
  iassign <* <$_anon1>> 1 (regread ptr %6, constval u32 0)
LOC 2 13
  regassign u32 %7 (constval u32 0)
  regassign ptr %1 (iaddrof a64 <* <$_anon1>> 3 (regread ptr %6))
  regassign a64 %2 (cvt a64 i32 (regread u32 %7))
  regassign a64 %3 (mul a64 (regread a64 %2, constval a64 2))
  regassign u64 %5 (cvt u64 u32 (regread u32 %7))
@@2   regassign a64 %4 (add a64 (regread ptr %1, regread a64 %3))
LOC 2 14
  iassign <* u16> 0 (regread a64 %4, constval u16 0)
LOC 2 15
  iassign <* u16> 0 (
    add a64 (regread a64 %4, constval a64 2),
    constval u16 0)
LOC 2 16
  iassign <* u16> 0 (
    add a64 (regread a64 %4, constval a64 4),
    constval u16 0)
LOC 2 17
  iassign <* u16> 0 (
    add a64 (regread a64 %4, constval a64 6),
    constval u16 0)
  regassign u64 %5 (add u64 (regread u64 %5, constval u64 4))
  regassign a64 %3 (add a64 (regread a64 %3, constval a64 8))
  brtrue @@2 (lt i32 u64 (regread u64 %5, constval u64 0x10000))
@@1 LOC 2 13
  return ()
}

LFTR cannot be done because (i < (1ul << 16)) causes i to be cast to 64-bit before doing the comparison, which makes it look different from the induction variable formed by strength reduction. If the "ul" is removed from 1ul, LFTR will happen and there'll only be 1 IV increment left:

func &StructInit (reg %5 <* <$_anon1>>) void {
  funcid 1

LOC 2 12
  iassign <* <$_anon1>> 1 (regread ptr %5, constval u32 0)
LOC 2 13
  regassign u32 %6 (constval u32 0)
  regassign ptr %1 (iaddrof a64 <* <$_anon1>> 3 (regread ptr %5))
  regassign a64 %2 (cvt a64 i32 (regread u32 %6))
  regassign a64 %3 (mul a64 (regread a64 %2, constval a64 2))
@@2   regassign a64 %4 (add a64 (regread ptr %1, regread a64 %3))
LOC 2 14
  iassign <* u16> 0 (regread a64 %4, constval u16 0)
LOC 2 15
  iassign <* u16> 0 (
    add a64 (regread a64 %4, constval a64 2),
    constval u16 0)
LOC 2 16
  iassign <* u16> 0 (
    add a64 (regread a64 %4, constval a64 4),
    constval u16 0)
LOC 2 17
  iassign <* u16> 0 (
    add a64 (regread a64 %4, constval a64 6),
    constval u16 0)
  regassign a64 %3 (add a64 (regread a64 %3, constval a64 8))
  brtrue @@2 (lt i32 u32 (regread a64 %3, constval a64 0x20000))
@@1 LOC 2 13
  return ()
}

Sign in to comment

Status
Assignees
Milestones
Pull Requests
Successfully merging a pull request will close this issue.
Branches
Planned to start   -   Planned to end
-
Top level
Priority
参与者(2)
C++
1
https://git.oschina.net/openarkcompiler/OpenArkCompiler.git
git@git.oschina.net:openarkcompiler/OpenArkCompiler.git
openarkcompiler
OpenArkCompiler
OpenArkCompiler

Search

161121 f78d6d6f 1850385 154831 86f8c370 1850385