提高python循环计算效率的方法

问题描述

我想加快以下与球面模式相关的代码。这是我的实际代码的简化（我不想过度简化它，因为它可能导致对我的实际问题无效的解决方案）：

import numpy as np
import time
import math

def function_call(npp,nmax):
    matrix_a = np.random.rand(npp)
    matrix_b = np.random.rand(npp)
    a=np.random.rand()
    F = np.zeros((2*npp,2*nmax*(nmax+2)),dtype=np.complex_) 
    npa=np.arange(npp)
    for n in range(1,nmax+1,1):
        a_n = np.sqrt(1 / (2 * np.pi * n * (n + 1)))
        for m in range(-n,n+1,1):        
            b_m = (-1)**((np.abs(m) + m) / 2)
            p_mn = int(1 / 2 * (np.abs(m) + n + 1 / 2 * (1 - (-1)**(np.abs(m) + n))))
            alpha_mn = np.sqrt(((2 * n + 1) / 2) * math.factorial(n - np.abs(m)) / math.factorial(n + np.abs(m)))
            A_mn = np.zeros(npp)
            B_mn = np.zeros(npp)
            for p in range(p_mn,1):
                Cai_pmn = math.factorial(n) * ((-1)**(n + p)) / (math.factorial(p) * math.factorial(n - p)) * math.factorial(2 * p)/math.factorial(2 * p - np.abs(m) - n)
                A_mn = A_mn + Cai_pmn * (np.cos(matrix_a))**(2 * p - np.abs(m) - n)
                B_mn = B_mn + (2 * p - np.abs(m) - n) * Cai_pmn * (np.cos(matrix_a))**(np.abs(2 * p - np.abs(m) - n - 1))
            A_mn = A_mn / (2**n * math.factorial(n))
            B_mn = B_mn / (2**n * math.factorial(n))
            S_mn = alpha_mn * m * A_mn * np.sin(matrix_a)**np.abs(np.abs(m) - 1)
            D_mn = alpha_mn * (np.abs(m) * A_mn * np.cos(matrix_a) * (np.sin(matrix_a))**(np.abs(np.abs(m) - 1)) - B_mn * (np.sin(matrix_a))**(np.abs(m) + 1))
            h1 = 1j**(n+1)*np.exp(-1j*a)/(a)
            h2 = 1j**(n)*np.exp(-1j*a)/(a)
            F_s1_theta = 1j * a_n * b_m * h1 * (S_mn * np.exp(1j * m * matrix_b))
            F_s1_phi =       -a_n * b_m * h1 * (D_mn * np.exp(1j * m * matrix_b))
            F_s2_theta =      a_n * b_m * h2 * (D_mn * np.exp(1j * m * matrix_b))
            F_s2_phi =   1j * a_n * b_m * h2 * (S_mn * np.exp(1j * m * matrix_b))
            j = 2 * (n * (n + 1) + m - 1)
            F[2 * npa,j] = F_s1_theta[npa]
            F[2 * npa+1,j] = F_s1_phi[npa]
            j = 2 * (n * (n + 1) + m - 1) + 1
            F[2 * npa,j] = F_s2_theta[npa]
            F[2 * npa+1,j] = F_s2_phi[npa]               
prev_time_ep =time.time()
npp=500
nmax=80 
function_call(npp,nmax)           
print("       --- %s seconds ---" % (time.time() - prev_time_ep))

我尝试的第一个选项是对其进行矢量化（我花了一些时间，因为它并不明显）。但是内存消耗增长很快，效率不高。

我也尝试过使用 Numba，实际上我之前成功地减少了 4，但如果可能的话，我一直在寻找更大的改进。

我也读过，也许多处理或 Cython 可能是不错的选择。也许有一种方法可以在不快速增加内存使用量的情况下对其进行矢量化？

解决方法

我不确定 Python 中哪些循环比其他循环更快，但正如我在您的代码中看到的那样，有一些重复的函数调用（相同的参数）。

例如：

A_mn = A_mn / (2**n * math.factorial(n))
B_mn = B_mn / (2**n * math.factorial(n))

两个声明共享相同的分母。尝试先计算阶乘，将其保存到变量中并将其用作分母。

此外，np.exp() 函数被多次调用，所有参数都相同：

np.exp(-1j*a)/(a)
np.exp(1j * m * matrix_b)

Numpy 指数函数的执行时间可能有点慢。

同样的事情发生在所有 np.sin / np.cos(matrix_a) 调用中。

这些更改不会为您带来巨大的改进，但会有所帮助：

def function_call_2(npp,nmax):
    matrix_a = np.random.rand(npp)
    matrix_b = np.random.rand(npp)
    a=np.random.rand()
    F = np.zeros((2*npp,2*nmax*(nmax+2)),dtype=np.complex_) 
    npa=np.arange(npp)

    # Auxiliary variables
    sin_matrix_a = np.sin(matrix_a)
    cos_matrix_a = np.cos(matrix_a)

    for n in range(1,nmax+1,1):

        # Auxilary variables
        denominator = (2**n * math.factorial(n))

        a_n = np.sqrt(1 / (2 * np.pi * n * (n + 1)))
        for m in range(-n,n+1,1):        
            b_m = (-1)**((np.abs(m) + m) / 2)
            p_mn = int(1 / 2 * (np.abs(m) + n + 1 / 2 * (1 - (-1)**(np.abs(m) + n))))
            alpha_mn = np.sqrt(((2 * n + 1) / 2) * math.factorial(n - np.abs(m)) / math.factorial(n + np.abs(m)))
            A_mn = np.zeros(npp)
            B_mn = np.zeros(npp)
            for p in range(p_mn,1):
                Cai_pmn = math.factorial(n) * ((-1)**(n + p)) / (math.factorial(p) * math.factorial(n - p)) * math.factorial(2 * p)/math.factorial(2 * p - np.abs(m) - n)
                A_mn = A_mn + Cai_pmn * (cos_matrix_a)**(2 * p - np.abs(m) - n)
                B_mn = B_mn + (2 * p - np.abs(m) - n) * Cai_pmn * (cos_matrix_a)**(np.abs(2 * p - np.abs(m) - n - 1))
            A_mn = A_mn / denominator
            B_mn = B_mn / denominator
            S_mn = alpha_mn * m * A_mn * sin_matrix_a**np.abs(np.abs(m) - 1)
            D_mn = alpha_mn * (np.abs(m) * A_mn * cos_matrix_a * (sin_matrix_a)**(np.abs(np.abs(m) - 1)) - B_mn * (sin_matrix_a)**(np.abs(m) + 1))

            # Auxilary variables
            np_exp_1 = np.exp(-1j*a)/(a)

            np_exp_2 = np.exp(1j * m * matrix_b)

            h1 = 1j**(n+1)*np_exp_1
            h2 = 1j**(n)*np_exp_1

            F_s1_theta = 1j * a_n * b_m * h1 * (S_mn * np_exp_2)
            F_s1_phi =       -a_n * b_m * h1 * (D_mn * np_exp_2)
            F_s2_theta =      a_n * b_m * h2 * (D_mn * np_exp_2)
            F_s2_phi =   1j * a_n * b_m * h2 * (S_mn * np_exp_2)
            j = 2 * (n * (n + 1) + m - 1)
            F[2 * npa,j] = F_s1_theta[npa]
            F[2 * npa+1,j] = F_s1_phi[npa]
            j = 2 * (n * (n + 1) + m - 1) + 1
            F[2 * npa,j] = F_s2_theta[npa]
            F[2 * npa+1,j] = F_s2_phi[npa]

我做了一个简单的测试，我得到了大约 3 秒的执行速度：

prev_time_ep =time.time()
npp=500
nmax=80 
function_call(npp,nmax)           
print("--- %s seconds ---" % (time.time() - prev_time_ep))

prev_time_ep =time.time()
function_call_2(npp,nmax)           
print("--- %s seconds ---" % (time.time() - prev_time_ep))

输出：

--- 16.99950885772705 seconds ---
--- 14.231853580474854 seconds ---

首先，您的代码具有 O(n^3) 复杂度，这没什么大不了的。

很多代码都可以使用 array programming 来完成，这会加快很多速度，特别是使用 numpy。

我建议使用分析器找出不执行的代码并开始在循环外重写代码。

我写了一个工具 perf_tools 对这类作品很有用。它可以指导您采用某种性能驱动的解决方案。

我对你的代码做了一些工作，这里是基准测试。瓶颈在于阶乘计算。

    ================== PerfTool ==================
task                     |aver(s) |sum(s)  |count   |std     
main loop                |   0.134|  10.712|      80|   0.101
  +-second loop          |   0.134|  10.712|      80|   0.101
    +-A                  |   0.000|   0.245|    6560|   0.000
    +-B                  |   0.001|   5.648|    6560|   0.001
    +-C                  |   0.000|   0.541|    6560|   0.000
    +-D                  |   0.000|   1.505|    6560|   0.000
    +-E                  |   0.000|   1.769|    6560|   0.000
    +-F                  |   0.000|   0.867|    6560|   0.000
mx creation              |   0.000|   0.000|       1|   0.000
preparation              |   0.000|   0.000|       1|   0.000

overall                  |    0.03|   10.71|   39522|-

B 和 C 哨兵是：

      with PerfTool('B'):
                for p in range(p_mn,1):
                    Cai_pmn = math.factorial(n) * ((-1)**(n + p)) / (math.factorial(p) * math.factorial(n - p)) * math.factorial(2 * p)/math.factorial(2 * p - np.abs(m) - n)
                    A_mn = A_mn + Cai_pmn * (np.cos(matrix_a))**(2 * p - np.abs(m) - n)
                    B_mn = B_mn + (2 * p - np.abs(m) - n) * Cai_pmn * (np.cos(matrix_a))**(np.abs(2 * p - np.abs(m) - n - 1))

      with PerfTool('C'):
                A_mn = A_mn / (2**n * math.factorial(n))
                B_mn = B_mn / (2**n * math.factorial(n))

正如你所见，大部分时间都花在了 B 上，所以我在这里添加了一种缓存：

   rng = np.arange(1,1)
    cache = dict(zip(rng,factorial(rng)))
    def get_factorial(w,cache=cache):
        if w not in cache:
            cache[w] = math.factorial(w)
        return cache[w]

使用 math.factorial 代替，避免重新计算相同的值。

最后，B被重构为B_vec，这就是邪恶的根源！我已将该代码标记为 B_vec_slow，这 2 行占用了大部分时间。

            with PerfTool('B_vec'):
                prng = np.arange(p_mn,n+1)
                Cai_pmn_vec = get_factorial(n) * ((-1)**(n + prng)) / (factorial(prng) * factorial(n - prng)) * factorial(2 * prng)/factorial(2 * prng - np.abs(m) - n)
                with PerfTool('B_vec_slow'):
                    A_mn_vec = Cai_pmn_vec*np.power(cos_matrix_a[:,np.newaxis],2 * prng - np.abs(m) - n)
                    B_mn_vec = (2 * prng - np.abs(m) - n) * Cai_pmn_vec * np.power(cos_matrix_a[:,np.abs(2 * prng - np.abs(m) - n - 1))
                A_mn = np.sum(A_mn_vec,axis=1)
                B_mn = np.sum(B_mn_vec,axis=1)

结果如下：

================== PerfTool ==================
task                     |aver(s) |sum(s)  |count   |std     
main loop                |   0.072|   5.736|      80|   0.052
  +-second loop          |   0.072|   5.735|      80|   0.052
    +-A                  |   0.000|   0.194|    6560|   0.000
    +-B_vec              |   0.001|   3.490|    6560|   0.000
      +-B_vec_slow       |   0.000|   2.987|    6560|   0.000
    +-C                  |   0.000|   0.126|    6560|   0.000
    +-D                  |   0.000|   0.536|    6560|   0.000
    +-E                  |   0.000|   0.768|    6560|   0.000
    +-F                  |   0.000|   0.522|    6560|   0.000
preparation              |   0.000|   0.000|       1|   0.000
mx creation              |   0.000|   0.000|       1|   0.000

overall                  |    0.01|    5.74|   46082|-

如果你能在这两条线上工作，你可以期望在 2/3 秒内运行。

HERE：优化后的代码：https://www.codepile.net/pile/8oDyGp6Q

computation numba numpy performance python