unknown
1970-01-01 00:00:00 UTC
Will the N*N be hoisted?
Hi All,
This one looks somewhat similar to the last example, but is different.
int foo(int N, int j, int *x, int *z) /* test case: advances x by a loop-invariant stride and sums *x and *z into y */
{
int y = N; /* accumulator starts from the caller's N, before N is rescaled below */
N += 7;
N >>= 3; /* N is now (N + 7) >> 3 and is never written again, so it is loop-invariant */
int i;
for(i = 0; i< j; i++) /* runs j times; the body is skipped entirely when j <= 0 */
{
x += N*N << 3; /* parsed as (N*N) << 3 since * binds tighter than <<; the stride is loop-invariant and counted in int elements */
z = x + N; /* z is recomputed from x every iteration; the z value passed in is never read */
y = y + *x + *z; /* accumulate the two loaded ints */
}
return y;
}
Assembly of the loop at -O3.
.p2align 4,,15
#<loop> Loop body line 7, nesting depth: 1, estimated iterations: 1000
.loc 1 9 0
# 8 {
# 9 x += N*N << 3;
movl %eax,%ebx # [0]
.loc 1 11 0
# 10 z = x + N;
# 11 y = y + *x + *z;
addl $1,%ebp # [0]
.loc 1 9 0
imull %eax,%ebx # [1]
shll $3,%ebx # [4]
shll $2,%ebx # [5]
addl %ebx,%edi # [6]
addl %ebx,%esi # [6]
.loc 1 11 0
movl 0(%edi),%ecx # [7] id:23
addl 0(%esi),%ecx # [10]
addl %ecx,%edx # [13]
cmpl 36(%esp),%ebp # [13] j
jl .Lt_0_3586 # [16]
As we see, the imul instruction remains in the loop
(as do the two consecutive shll instructions, which could be folded into a
single shift; my guess is that CG assumes WOPT never produces such input,
so CG does not optimize it, simple as it is).
It looks like SSA PRE omitted the RHS of the Iv_update statement x += N*N<<3,
and VNFRE is doing only one level of CSE, namely promoting the ASHR + LDC 3
out of the loop.
I am curious why SSA PRE omits the expression here. When I disable this
omission in opt_etable.cxx, the result looks good for this test case. I wonder
whether doing so would cause a correctness issue for some other test case, or
a performance issue?
It should be noted that one strength-reduction transformation is performed on
z in this case. Also, replacing "N>>=3;" with "N*=5;" results in similarly
sub-optimal code.
Best Regards,
Yiran Wang
------------------------------------------------------------------------------
Build for Windows Store.
http://p.sf.net/sfu/windows-dev2dev
_______________________________________________
Open64-devel mailing list
https://lists.sourceforge.net/lists/listinfo/open64-devel
This one looks somewhat similar to the last example, but is different.
int foo(int N, int j, int *x, int *z) /* test case: advances x by a loop-invariant stride and sums *x and *z into y */
{
int y = N; /* accumulator starts from the caller's N, before N is rescaled below */
N += 7;
N >>= 3; /* N is now (N + 7) >> 3 and is never written again, so it is loop-invariant */
int i;
for(i = 0; i< j; i++) /* runs j times; the body is skipped entirely when j <= 0 */
{
x += N*N << 3; /* parsed as (N*N) << 3 since * binds tighter than <<; the stride is loop-invariant and counted in int elements */
z = x + N; /* z is recomputed from x every iteration; the z value passed in is never read */
y = y + *x + *z; /* accumulate the two loaded ints */
}
return y;
}
Assembly of the loop at -O3.
.p2align 4,,15
#<loop> Loop body line 7, nesting depth: 1, estimated iterations: 1000
.loc 1 9 0
# 8 {
# 9 x += N*N << 3;
movl %eax,%ebx # [0]
.loc 1 11 0
# 10 z = x + N;
# 11 y = y + *x + *z;
addl $1,%ebp # [0]
.loc 1 9 0
imull %eax,%ebx # [1]
shll $3,%ebx # [4]
shll $2,%ebx # [5]
addl %ebx,%edi # [6]
addl %ebx,%esi # [6]
.loc 1 11 0
movl 0(%edi),%ecx # [7] id:23
addl 0(%esi),%ecx # [10]
addl %ecx,%edx # [13]
cmpl 36(%esp),%ebp # [13] j
jl .Lt_0_3586 # [16]
As we see, the imul instruction remains in the loop
(as do the two consecutive shll instructions, which could be folded into a
single shift; my guess is that CG assumes WOPT never produces such input,
so CG does not optimize it, simple as it is).
It looks like SSA PRE omitted the RHS of the Iv_update statement x += N*N<<3,
and VNFRE is doing only one level of CSE, namely promoting the ASHR + LDC 3
out of the loop.
I am curious why SSA PRE omits the expression here. When I disable this
omission in opt_etable.cxx, the result looks good for this test case. I wonder
whether doing so would cause a correctness issue for some other test case, or
a performance issue?
It should be noted that one strength-reduction transformation is performed on
z in this case. Also, replacing "N>>=3;" with "N*=5;" results in similarly
sub-optimal code.
Best Regards,
Yiran Wang
------------------------------------------------------------------------------
Build for Windows Store.
http://p.sf.net/sfu/windows-dev2dev
_______________________________________________
Open64-devel mailing list
https://lists.sourceforge.net/lists/listinfo/open64-devel
--
Regards,
Lai Jian-Xin
--089e015368e0ad55a604e01dac9a
Content-Type: text/html; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable
<div dir="ltr"><div>
Regards,
Lai Jian-Xin
--089e015368e0ad55a604e01dac9a
Content-Type: text/html; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable
<div dir="ltr"><div>