VC++ 优化器对同一个 _mm256_loadu_ps 内联函数(intrinsic)执行了两次内存加载

问题描述

以下是源代码:

/// <summary>
/// Runs the Data Migration Assistant SKU-recommendation data-collection script
/// inside a dedicated PowerShell runspace and reports its output.
/// </summary>
/// <remarks>
/// The script parameters are taken from <c>CompName</c>, <c>OutPutPath</c>,
/// <c>time</c> and <c>DBString</c>, which are declared outside this method.
/// Every result object is written to the debug output and shown in a message box;
/// entries of the PowerShell error stream are written to the debug output.
/// Any exception is caught and its message is shown in a message box.
/// </remarks>
public static void RunPowershellCommand()
    {
        try
        {
            using (var runspace = RunspaceFactory.CreateRunspace())
            {
                // The API is Runspace.Open(); C# is case-sensitive, so "open()" does not compile.
                runspace.Open();
                using (var ps = System.Management.Automation.PowerShell.Create())
                {
                    ps.Runspace = runspace;

                    var queryWmi = new Command(@"C:\Program Files\Microsoft Data Migration Assistant\SkuRecommendationDataCollectionScript.ps1");
                    queryWmi.Parameters.Add("ComputerName",CompName);
                    queryWmi.Parameters.Add("OutputFilePath",OutPutPath);
                    queryWmi.Parameters.Add("CollectionTimeInSeconds",time.ToString());
                    queryWmi.Parameters.Add("DbConnectionString",DBString);

                    // Step 1: relax the execution policy so the script file is allowed to run.
                    ps.AddScript("Set-ExecutionPolicy RemoteSigned");
                    ps.Invoke();

                    // Invoke() does not reset the command list. Without this Clear() the
                    // collection script would be appended to the Set-ExecutionPolicy
                    // pipeline instead of being executed on its own.
                    ps.Commands.Clear();

                    // Step 2: run the collection script itself.
                    ps.Commands.AddCommand(queryWmi);
                    var commandResults = ps.Invoke();

                    Debug.WriteLine($"PS command result {commandResults.Count} lines");
                    foreach (var commandResult in commandResults)
                    {
                        Debug.WriteLine(" Command Result : " + commandResult);
                        MessageBox.Show(commandResult.ToString());
                    }

                    // Non-terminating errors do not throw; they accumulate in the error stream.
                    if (ps.Streams.Error.Count != 0)
                    {
                        Debug.WriteLine("Errors:");
                        foreach (var error in ps.Streams.Error)
                        {
                            Debug.WriteLine("Command Error : " + error.ToString());
                        }
                    }

                    Console.WriteLine("Finished");
                    Console.ReadLine();
                }
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.Message);
        }
    }

下面先给出相关的 C++ 源代码,其后是 VC++ 2017 生成的汇编代码(调用该函数的完整循环):

// 4-wide variant of multiplyComplex: multiplies the complex numbers packed in x
// by the corresponding complex numbers packed in y.
__forceinline __m256 multiplyComplex_x4( const __m256 x,const __m256 y )
{
    // For each pair x = [ a,b ] and y = [ c,d ] the result is [ ac - bd,ad + bc ]

    const __m256 xLow = _mm256_moveldup_ps( x );   // [ a,a ]
    const __m256 xHigh = _mm256_movehdup_ps( x );  // [ b,b ]
    const __m256 ySwapped = _mm256_permute_ps( y,shuffleMask_rev64q );  // [ d,c ]
    const __m256 cross = _mm256_mul_ps( xHigh,ySwapped );  // [ bd,bc ]

    // fmaddsub subtracts `cross` in even lanes and adds it in odd lanes
    return _mm256_fmaddsub_ps( xLow,y,cross );
}

// 4-wide variant of fftMainLoop: processes 4 complex numbers per call.
// Reads one 256-bit vector from a1c and one from a2c, then writes
// a1 + omega * a2 back to a1c and a1 - omega * a2 back to a2c.
__forceinline void fftMainLoop_x4( const __m256 omega,complex* a1c,complex* a2c )
{
    float* const first = (float*)a1c;
    float* const second = (float*)a2c;

    // Both inputs are loaded before anything is stored
    const __m256 lhs = _mm256_loadu_ps( first );
    const __m256 rhs = _mm256_loadu_ps( second );

    const __m256 scaled = multiplyComplex_x4( omega,rhs );
    const __m256 sum = _mm256_add_ps( lhs,scaled );
    const __m256 difference = _mm256_sub_ps( lhs,scaled );

    _mm256_storeu_ps( first,sum );
    _mm256_storeu_ps( second,difference );
}

这段代码读取内存的次数比我期望的多出 50%。

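问题出在下面汇编代码中的两条指令:vpermilps ymm0,ymmword ptr [rdi+rax*8],0B1h 和 vfmaddsub231ps ymm2,ymm5,ymmword ptr [rdi+rax*8]。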

00007FF7CC3F11B0  lea         eax,[r8+4]  
00007FF7CC3F11B4  mov         ecx,r8d  
00007FF7CC3F11B7  vpermilps   ymm0,ymmword ptr [rdi+rax*8],0B1h  
00007FF7CC3F11BE  vmulps      ymm2,ymm0,ymm4  
00007FF7CC3F11C2  vfmaddsub231ps ymm2,ymm5,ymmword ptr [rdi+rax*8]  
00007FF7CC3F11C8  vmovups     ymm3,ymmword ptr [rdi+rcx*8]  
00007FF7CC3F11CD  vaddps      ymm0,ymm2,ymm3  
00007FF7CC3F11D1  add         r8d,8  
00007FF7CC3F11D5  vsubps      ymm1,ymm3,ymm2  
00007FF7CC3F11D9  vmovups     ymmword ptr [rdi+rcx*8],ymm0  
00007FF7CC3F11DE  vmovups     ymmword ptr [rdi+rax*8],ymm1  
00007FF7CC3F11E3  cmp         r8d,esi  
00007FF7CC3F11E6  jb          fft_run+1B0h (07FF7CC3F11B0h)  

vpermilps   ymm0,0B1h

vpermilps 和 vfmaddsub231ps 这两条指令访问的是同一个内存地址 [rdi+rax*8]。

是否有办法让编译器为我的 _mm256_loadu_ps 只生成一条 vmovups 加载指令,随后让 vpermilps 和 vfmaddsub231ps 使用寄存器操作数,而不是把内存加载合并进这两条指令(从而对 ymmword ptr [rdi+rax*8] 读取两次)?

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)