问题:
我试图提高Python中空气动力学函数的速度。
功能集:
import numpy as np
from numba import njit
def calculate_velocity_induced_by_line_vortices(
points, origins, terminations, strengths, collapse=True
):
# Expand the dimensionality of the points input. It is now of shape (N x 1 x 3).
# This will allow NumPy to broadcast the upcoming subtractions.
points = np.expand_dims(points, axis=1)
# Define the vectors from the vortex to the points. r_1 and r_2 now both are of
# shape (N x M x 3). Each row/column pair holds the vector associated with each
# point/vortex pair.
r_1 = points - origins
r_2 = points - terminations
r_0 = r_1 - r_2
r_1_cross_r_2 = nb_2d_explicit_cross(r_1, r_2)
r_1_cross_r_2_absolute_magnitude = (
r_1_cross_r_2[:, :, 0] ** 2
+ r_1_cross_r_2[:, :, 1] ** 2
+ r_1_cross_r_2[:, :, 2] ** 2
)
r_1_length = nb_2d_explicit_norm(r_1)
r_2_length = nb_2d_explicit_norm(r_2)
# Define the radius of the line vortices. This is used to get rid of any
# singularities.
radius = 3.0e-16
# Set the lengths and the absolute magnitudes to zero, at the places where the
# lengths and absolute magnitudes are less than the vortex radius.
r_1_length[r_1_length < radius] = 0
r_2_length[r_2_length < radius] = 0
r_1_cross_r_2_absolute_magnitude[r_1_cross_r_2_absolute_magnitude < radius] = 0
# Calculate the vector dot products.
r_0_dot_r_1 = np.einsum("ijk,ijk->ij", r_0, r_1)
r_0_dot_r_2 = np.einsum("ijk,ijk->ij", r_0, r_2)
# Calculate k and then the induced velocity, ignoring any divide-by-zero or nan
# errors. k is of shape (N x M)
with np.errstate(divide="ignore", invalid="ignore"):
k = (
strengths
/ (4 * np.pi * r_1_cross_r_2_absolute_magnitude)
* (r_0_dot_r_1 / r_1_length - r_0_dot_r_2 / r_2_length)
)
# Set the shape of k to be (N x M x 1) to support numpy broadcasting in the
# subsequent multiplication.
k = np.expand_dims(k, axis=2)
induced_velocities = k * r_1_cross_r_2
# Set the values of the induced velocity to zero where there are singularities.
induced_velocities[np.isinf(induced_velocities)] = 0
induced_velocities[np.isnan(induced_velocities)] = 0
if collapse:
induced_velocities = np.sum(induced_velocities, axis=1)
return induced_velocities
@njit
def nb_2d_explicit_norm(vectors):
return np.sqrt(
(vectors[:, :, 0]) ** 2 + (vectors[:, :, 1]) ** 2 + (vectors[:, :, 2]) ** 2
)
@njit
def nb_2d_explicit_cross(a, b):
e = np.zeros_like(a)
e[:, :, 0] = a[:, :, 1] * b[:, :, 2] - a[:, :, 2] * b[:, :, 1]
e[:, :, 1] = a[:, :, 2] * b[:, :, 0] - a[:, :, 0] * b[:, :, 2]
e[:, :, 2] = a[:, :, 0] * b[:, :, 1] - a[:, :, 1] * b[:, :, 0]
return e上下文:
这个函数由Ptera软件使用,它是一种用于机翼空气动力学的开源解算器.如下面的配置文件输出所示,它是Ptera软件运行时的最大贡献者。

目前,Ptera软件只需3分钟就能运行一个典型的案例,而我的目标是将其控制在1分钟以下。
该函数接受一组点、起源、终止和优点。在每一个点,它发现由线涡引起的诱导速度,其特征是起源、终止和强度组。如果崩塌是真的,那么输出就是由于涡旋而在每个点诱导的累积速度。如果为假,则函数在每个点输出每个涡旋对速度的贡献。
在典型的运行过程中,速度函数大约被调用2000次。首先,调用涉及输入参数相对较小的向量(大约200个点、起源、终止和强度)。后来的调用涉及到大量的输入参数(大约400个点和大约6,000个起源、终止和优势)。对于所有大小的输入,一个理想的解决方案是快速的,但是提高大输入调用的速度更重要。
为了进行测试,我建议您使用自己的函数实现运行以下脚本:
import timeit
import matplotlib.pyplot as plt
import numpy as np
n_repeat = 2
n_execute = 10 ** 3
min_oom = 0
max_oom = 3
times_py = []
for i in range(max_oom - min_oom + 1):
n_elem = 10 ** i
n_elem_pretty = np.format_float_scientific(n_elem, 0)
print("Number of elements: " + n_elem_pretty)
# Benchmark Python.
print("\tBenchmarking Python...")
setup = '''
import numpy as np
these_points = np.random.random((''' + str(n_elem) + ''', 3))
these_origins = np.random.random((''' + str(n_elem) + ''', 3))
these_terminations = np.random.random((''' + str(n_elem) + ''', 3))
these_strengths = np.random.random(''' + str(n_elem) + ''')
def calculate_velocity_induced_by_line_vortices(points, origins, terminations,
strengths, collapse=True):
pass
'''
statement = '''
results_orig = calculate_velocity_induced_by_line_vortices(these_points, these_origins,
these_terminations,
these_strengths)
'''
times = timeit.repeat(repeat=n_repeat, stmt=statement, setup=setup, number=n_execute)
time_py = min(times)/n_execute
time_py_pretty = np.format_float_scientific(time_py, 2)
print("\t\tAverage Time per Loop: " + time_py_pretty + " s")
# Record the times.
times_py.append(time_py)
sizes = [10 ** i for i in range(max_oom - min_oom + 1)]
fig, ax = plt.subplots()
ax.plot(sizes, times_py, label='Python')
ax.set_xscale("log")
ax.set_xlabel("Size of List or Array (elements)")
ax.set_ylabel("Average Time per Loop (s)")
ax.set_title(
"Comparison of Different Optimization Methods\nBest of "
+ str(n_repeat)
+ " Runs, each with "
+ str(n_execute)
+ " Loops"
)
ax.legend()
plt.show()以前的尝试:
我之前加速这个函数的尝试包括将它矢量化(这很好,所以我保留了这些更改),并尝试了Numba的JIT编译器。我和Numba的结果好坏参半。当我尝试用Numba对整个速度函数进行修改时,我的结果比以前慢得多。然而,我发现Numba显着地加快了跨乘积和规范功能,这是我在上面实现的。
更新:
更新1:
根据水星的评论(后来被删除),我取代了
points = np.expand_dims(points, axis=1)
r_1 = points - origins
r_2 = points - terminations通过对以下函数的两个调用:
@njit
def subtract(a, b):
c = np.empty((a.shape[0], b.shape[0], 3))
for i in range(a.shape[0]):
for j in range(b.shape[0]):
for k in range(3):
c[i, j, k] = a[i, k] - b[j, k]
return c这导致速度从227秒提高到220秒,这更好!然而,它仍然不够快。
我还尝试将njit快速数学标志设置为true,并使用numba函数代替对np.einsum的调用。也没有提高速度。
更新2:
有了Jér me Richard的回答,现在的运行时间是156秒,减少了29%!我很满意地接受了这个答案,但是如果你认为你可以改进他们的工作,可以提出其他建议!
发布于 2021-03-23 03:51:00
首先,Numba可以执行并行计算(),如果您主要使用parallel=True和prange手动请求它,则会产生更快的代码。这对于大数组(但对于小型数组)是有用的。
此外,您的计算主要是内存绑定。因此,您应该避免在没有多次重用的情况下创建大型数组,或者更普遍地避免在不能动态地重新计算它们时(以相对便宜的方式)。例如,r_0就是这样的情况。
此外,内存访问模式很重要:当访问是内存中的相邻的时,矢量化更有效,而缓存/RAM的使用效率更高。因此,arr[0, :, :] = 0应该比arr[:, :, 0] = 0更快。类似地,arr[:, :, 0] = arr[:, :, 1] = 0应该比arr[:, :, 0:2] = 0慢,因为前者执行非连续内存传递,而后者只执行一次连续内存传递。有时,它可能是有益的转换您的数据,以便下面的计算更快。
此外,Numpy倾向于创建许多临时数组(),它们的分配成本很高。当输入数组很小时,这是一个很大的问题。在大多数情况下,Numba jit可以避免这种情况。
最后,关于您的计算,对大数组使用GPU可能是个好主意(对于小数组肯定不是这样)。您可以查看一下cupy或clpy就可以很容易地做到这一点。
下面是一个在CPU上工作的优化实现:
import numpy as np
from numba import njit, prange
@njit(parallel=True)
def subtract(a, b):
c = np.empty((a.shape[0], b.shape[0], 3))
for i in prange(c.shape[0]):
for j in range(c.shape[1]):
for k in range(3):
c[i, j, k] = a[i, k] - b[j, k]
return c
@njit(parallel=True)
def nb_2d_explicit_norm(vectors):
res = np.empty((vectors.shape[0], vectors.shape[1]))
for i in prange(res.shape[0]):
for j in range(res.shape[1]):
res[i, j] = np.sqrt(vectors[i, j, 0] ** 2 + vectors[i, j, 1] ** 2 + vectors[i, j, 2] ** 2)
return res
# NOTE: better memory access pattern
@njit(parallel=True)
def nb_2d_explicit_cross(a, b):
e = np.empty(a.shape)
for i in prange(e.shape[0]):
for j in range(e.shape[1]):
e[i, j, 0] = a[i, j, 1] * b[i, j, 2] - a[i, j, 2] * b[i, j, 1]
e[i, j, 1] = a[i, j, 2] * b[i, j, 0] - a[i, j, 0] * b[i, j, 2]
e[i, j, 2] = a[i, j, 0] * b[i, j, 1] - a[i, j, 1] * b[i, j, 0]
return e
# NOTE: avoid the slow building of temporary arrays
@njit(parallel=True)
def cross_absolute_magnitude(cross):
return cross[:, :, 0] ** 2 + cross[:, :, 1] ** 2 + cross[:, :, 2] ** 2
# NOTE: avoid the slow building of temporary arrays again and multiple pass in memory
# Warning: do the work in-place
@njit(parallel=True)
def discard_singularities(arr):
for i in prange(arr.shape[0]):
for j in range(arr.shape[1]):
for k in range(3):
if np.isinf(arr[i, j, k]) or np.isnan(arr[i, j, k]):
arr[i, j, k] = 0.0
@njit(parallel=True)
def compute_k(strengths, r_1_cross_r_2_absolute_magnitude, r_0_dot_r_1, r_1_length, r_0_dot_r_2, r_2_length):
return (strengths
/ (4 * np.pi * r_1_cross_r_2_absolute_magnitude)
* (r_0_dot_r_1 / r_1_length - r_0_dot_r_2 / r_2_length)
)
@njit(parallel=True)
def rDotProducts(b, c):
assert b.shape == c.shape and b.shape[2] == 3
n, m = b.shape[0], b.shape[1]
ab = np.empty((n, m))
ac = np.empty((n, m))
for i in prange(n):
for j in range(m):
ab[i, j] = 0.0
ac[i, j] = 0.0
for k in range(3):
a = b[i, j, k] - c[i, j, k]
ab[i, j] += a * b[i, j, k]
ac[i, j] += a * c[i, j, k]
return (ab, ac)
# Compute `np.sum(arr, axis=1)` in parallel.
@njit(parallel=True)
def collapseArr(arr):
assert arr.shape[2] == 3
n, m = arr.shape[0], arr.shape[1]
res = np.empty((n, 3))
for i in prange(n):
res[i, 0] = np.sum(arr[i, :, 0])
res[i, 1] = np.sum(arr[i, :, 1])
res[i, 2] = np.sum(arr[i, :, 2])
return res
def calculate_velocity_induced_by_line_vortices(points, origins, terminations, strengths, collapse=True):
r_1 = subtract(points, origins)
r_2 = subtract(points, terminations)
# NOTE: r_0 is computed on the fly by rDotProducts
r_1_cross_r_2 = nb_2d_explicit_cross(r_1, r_2)
r_1_cross_r_2_absolute_magnitude = cross_absolute_magnitude(r_1_cross_r_2)
r_1_length = nb_2d_explicit_norm(r_1)
r_2_length = nb_2d_explicit_norm(r_2)
radius = 3.0e-16
r_1_length[r_1_length < radius] = 0
r_2_length[r_2_length < radius] = 0
r_1_cross_r_2_absolute_magnitude[r_1_cross_r_2_absolute_magnitude < radius] = 0
r_0_dot_r_1, r_0_dot_r_2 = rDotProducts(r_1, r_2)
with np.errstate(divide="ignore", invalid="ignore"):
k = compute_k(strengths, r_1_cross_r_2_absolute_magnitude, r_0_dot_r_1, r_1_length, r_0_dot_r_2, r_2_length)
k = np.expand_dims(k, axis=2)
induced_velocities = k * r_1_cross_r_2
discard_singularities(induced_velocities)
if collapse:
induced_velocities = collapseArr(induced_velocities)
return induced_velocities在我的机器上,这段代码比大小为的10**3数组上的初始实现快2.5倍。它还使用了一点少内存。
https://stackoverflow.com/questions/66750661
复制相似问题