文章/答案/技术大牛

发布

问核插值C++
EN

Code Review用户

提问于 2020-10-21 03:58:25

回答 1查看 105关注 0票数 4

该程序的目的是从规则网格节点的数值计算纳克= nx_ny_nz粒子在拉格朗日点处的速度。我怎样才能加快计算速度？我希望收到有关以下守则的建议：

#include <iomanip>
#include <omp.h>
#include <vector>
#include <cmath> 
#include <ctime>
#include <iostream>
#include <string>
#include <fstream>
#include <string.h>
#include <sstream>
#include <istream>
#include <stdlib.h>     
#include <algorithm>    

using namespace std;

void functionM4( vector<double> &b, vector<double> &w)   
// this is an interpolation function
{

    w[0]= 0.5*b[0]*b[0]*b[0]+2.5*b[0]*b[0]+4.0*b[0]+2.0;
    w[1]=-1.5*b[1]*b[1]*b[1]-2.5*b[1]*b[1]+1.0;
    w[2]= 1.5*b[2]*b[2]*b[2]-2.5*b[2]*b[2]+1.0;
    w[3]=-0.5*b[3]*b[3]*b[3]+2.5*b[3]*b[3]-4.0*b[3]+2.0;

}

int main()
{
    double pi = 3.141592653589793;

    int nx0 = 400;            // number of grid in X direction
    int ny0 = 400;            // number of grid in Y direction
    int nz0 = 400;            // number of grid in Y direction

    int nx = nx0+10;             
    int ny = ny0+10;            
    int nz = nz0+10;           
    int ng = nx*ny*nz;

    double xmin = 0.0;      // set up domain  
    double xmax = 2.0*pi;             
    double ymin = 0.0;            
    double ymax = 2.0*pi;             
    double zmin = 0.0;        
    double zmax = 2.0*pi;   

    double dx = (xmax-xmin)/(nx0);  //spacing step
    double dy = (ymax-ymin)/(ny0);      
    double dz = (zmax-zmin)/(nz0);      
    
    vector<double> x(nx,0.0);           
    vector<double> y(ny,0.0);            
    vector<double> z(nz,0.0);       

    for (int i=0;i<nx;i++)
        x[i] = xmin+(double)(i-5)*dx;  //building grid for x direction
        
    for (int j=0;j<ny;j++)
        y[j] = ymin+(double)(j-5)*dy;

    for (int k=0;k<nz;k++)
        z[k] = zmin+(double)(k-5)*dz;
 // ug is velocity component u on grid at the beginning
    vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    vector<vector<vector<double> > > vg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > wg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));   

// particles at Lagrangian points
    vector<vector<vector<double> > > xpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    vector<vector<vector<double> > > ypStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > zpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

//the particle velocity at Lagrangian locations
    vector<vector<vector<double> > > upStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));        
    vector<vector<vector<double> > > vpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > wpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    
    int i,j,k;
    #pragma omp parallel for private (j,k) schedule(dynamic)
    for (i=0;i<nx;i++)
        for (j=0;j<ny;j++)
            for (k=0;k<nz;k++)
            {
                //initial values of velocity components at grid nodes
                ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);    
                vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
                wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
            }


    double dt = 0.001;  // time step

//the particle velocity upStar at Lagrangian location xpStar is 
//obtained from the velocity ug at the grid nodes (x,y,z) 
//using a kernel interpolation function.

    cout<<" start "<<endl;
    #pragma omp parallel for private (j,k) schedule(dynamic)
    for (i=0;i<nx;i++) 
        for (j=0;j<ny;j++) 
            for (k=0;k<nz;k++){

                xpStar[i][j][k]= x[i] + dt*ug[i][j][k]; 
                ypStar[i][j][k]= y[j] + dt*vg[i][j][k];
                zpStar[i][j][k]= z[k] + dt*wg[i][j][k];
            }

    #pragma omp parallel for private (j,k) schedule(dynamic)   
    for (i=2;i<nx-2;i++) 
        for (j=2;j<ny-2;j++) 
            for (k=2;k<nz-2;k++){


                int Xindex = (int)((xpStar[i][j][k]- x[0])/dx);
                int Yindex = (int)((ypStar[i][j][k]- y[0])/dy);
                int Zindex = (int)((zpStar[i][j][k]- z[0])/dz);

                int west  = Xindex-1;  int est   = Xindex+2;
                int south = Yindex-1;  int north = Yindex+2;                
                int front = Zindex-1;  int back  = Zindex+2;    

                vector<double> BX(4,0.0);
                vector<double> BY(4,0.0);
                vector<double> BZ(4,0.0);

                vector<double> wx(4,0.0);
                vector<double> wy(4,0.0);
                vector<double> wz(4,0.0);

                for (int m=west;m<=est;m++){
                    BX[m-west]=(x[m]-xpStar[i][j][k])/dx;}
    
                for (int n=south;n<=north;n++){
                    BY[n-south]=(y[n]-ypStar[i][j][k])/dy;}

                for (int q=front;q<=back;q++){
                    BZ[q-front]=(z[q]-zpStar[i][j][k])/dz;}

                functionM4(BX,wx);
                functionM4(BY,wy);
                functionM4(BZ,wz);

                for (int m=west;m<=est;m++) 
                    for (int n=south;n<=north;n++)
                        for (int q=front;q<=back;q++){ 
                            
                            double w=wx[m-west]*wy[n-south]*wz[q-front];

                            upStar[i][j][k] += ug[m][n][q]*w;
                            vpStar[i][j][k] += vg[m][n][q]*w;
                            wpStar[i][j][k] += wg[m][n][q]*w;
                        }
            }


    cout<<" finish ! "<<endl;

    //-------------------------------------Checking results------------------------------------------//
    vector<vector<vector<double> > > exact (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
    vector<vector<vector<double> > > error (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

    #pragma omp parallel for
    for (int i=3;i<nx-3;i++)
        for (int j=3;j<ny-3;j++)
            for (int k=3;k<nz-3;k++)
            {
                exact[i][j][k] = -sin(xpStar[i][j][k])*cos(ypStar[i][j][k])*sin(zpStar[i][j][k]);
                error[i][j][k] = abs(exact[i][j][k]-upStar[i][j][k]);
            }

    double aa = 0.0;
    double bb = 0.0;
    double cc = 0.0;
    double dd = 0.0;
    double maxi = 0.0;
    for (int i=3;i<nx-3;i++)
        for (int j=3;j<ny-3;j++)
            for (int k=3;k<nz-3;k++)
            {
                aa += abs(exact[i][j][k]);
                bb += error[i][j][k];
                cc += error[i][j][k]*error[i][j][k];
                dd += exact[i][j][k]*exact[i][j][k];

                if (abs(error[i][j][k])>maxi)
                    maxi = abs(error[i][j][k]);
            }

    cout<<" L1_norm = "<<bb/aa<<endl;
    cout<<" L2_norm = "<<sqrt(cc/dd)<<endl;
    cout<<" L3_norm = "<<maxi<<endl;

    return 0;
//Compiler: g++ interpolation.cpp -o running -fopenmp
//export OMP_NUM_THREADS=30;
//Run:./running
}

c++11

openmp

回答 1

Code Review用户

发布于 2020-10-21 11:41:55

(注意:我对OMP一无所知，所以我没有评论它！)

double pi = 3.141592653589793;

int nx0 = 400;            // number of grid in X direction
...

常量应该声明为const (它帮助程序员，如果不是编译器的话)。

for (i=0;i<nx;i++)
    for (j=0;j<ny;j++)
        for (k=0;k<nz;k++)
        {
            ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);    
            //initial values of velocity components at grid nodes
            vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
            wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
        }

我知道这只是设置阶段，但是对于相同的值，这是很多重复的cos和sin计算。我们可以将计算移到外循环，或者(更好的)我们可以分别计算sin(x[n])、cos(x[n])、sin(y[n])等。

一次迭代所有三个速度网格可能比三个单独的循环要慢得多，以便独立地填充每个速度网格。(由于缓存丢失)。

vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

就内存分配而言，这是相当复杂的，所以创建(复制大量的一维和二维向量)和迭代(分配的内存不太可能是连续的，所以我们不知道如何以高速缓存高效的方式访问元素)是很慢的。

相反，尝试分配一个向量，并根据3d坐标计算索引，如下所示：

template<class T>
struct array_3d
{
    using index_t = typename std::vector<T>::size_type;

    array_3d(index_t width, index_t height, index_t depth):
        m_width(width), m_height(height), m_depth(depth),
        m_data(m_width * m_height * m_depth) { }

    T& at(index_t x, index_t y, index_t z)
    {
        return m_data[x + y * m_width + z * m_width * m_height];
    }
    
    T const& at(index_t x, index_t y, index_t z) const
    {
        return m_data[x + y * m_width + z * m_width * m_height];
    }

    index_t m_width, m_height, m_depth;
    std::vector<T> m_data;
};

如果我们在迭代这个数据结构时非常小心，我们就可以避免在内存中跳过，例如：

for (int k=0;k<nz;k++)
    for (int j=0;j<ny;j++)
        for (int i=0;i<nx;i++)
            xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);

xpStar.m_data的每次访问都会改变数组中的下一个元素。(因此，我们甚至可以避免从坐标中计算索引，只需增加迭代器，这可能会更快，也可能不会更快)。

(注意:我在上面实现它的方式是最内部的x组件(所以x+1是index+1)。这与您的初始代码(它有最内部的z )相反，但是只要您在迭代时按正确的顺序遍历维度，选择哪一个并不重要)。

            vector<double> BX(4,0.0);
            vector<double> BY(4,0.0);
            vector<double> BZ(4,0.0);

            vector<double> wx(4,0.0);
            vector<double> wy(4,0.0);
            vector<double> wz(4,0.0);

由于这些都是固定的大小，所以使用std::array<double, 4>代替！

            for (int m=west;m<=est;m++) 
                for (int n=south;n<=north;n++)
                    for (int q=front;q<=back;q++){ 
                        
                        double w=wx[m-west]*wy[n-south]*wz[q-front];

                        upStar[i][j][k] += ug[m][n][q]*w;
                        vpStar[i][j][k] += vg[m][n][q]*w;
                        wpStar[i][j][k] += wg[m][n][q]*w;
                    }

同样，对于upStar、vpStar和wpStar (然后预先计算w)，将其分割成单独的循环可能会更快，例如：

            array<double, 4*4*4> w;
            
            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

票数 3

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/250938

复制

相似问题

问核插值C++
EN

回答 1

Code Review用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问核插值C++EN

回答 1

Code Review用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问核插值C++
EN