首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >核插值C++

核插值C++
EN

Code Review用户
提问于 2020-10-21 03:58:25
回答 1查看 105关注 0票数 4

该程序的目的是从规则网格节点的数值计算纳克= nx_ny_nz粒子在拉格朗日点处的速度。我怎样才能加快计算速度?我希望收到有关以下守则的建议:

代码语言:javascript
复制
#include <iomanip>
#include <omp.h>
#include <vector>
#include <cmath> 
#include <ctime>
#include <iostream>
#include <string>
#include <fstream>
#include <string.h>
#include <sstream>
#include <istream>
#include <stdlib.h>     
#include <algorithm>    

using namespace std;

void functionM4( vector<double> &b, vector<double> &w)   
// this is an interpolation function
{

    w[0]= 0.5*b[0]*b[0]*b[0]+2.5*b[0]*b[0]+4.0*b[0]+2.0;
    w[1]=-1.5*b[1]*b[1]*b[1]-2.5*b[1]*b[1]+1.0;
    w[2]= 1.5*b[2]*b[2]*b[2]-2.5*b[2]*b[2]+1.0;
    w[3]=-0.5*b[3]*b[3]*b[3]+2.5*b[3]*b[3]-4.0*b[3]+2.0;

}

int main()
{
    double pi = 3.141592653589793;

    int nx0 = 400;            // number of grid in X direction
    int ny0 = 400;            // number of grid in Y direction
    int nz0 = 400;            // number of grid in Y direction

    int nx = nx0+10;             
    int ny = ny0+10;            
    int nz = nz0+10;           
    int ng = nx*ny*nz;

    double xmin = 0.0;      // set up domain  
    double xmax = 2.0*pi;             
    double ymin = 0.0;            
    double ymax = 2.0*pi;             
    double zmin = 0.0;        
    double zmax = 2.0*pi;   

    double dx = (xmax-xmin)/(nx0);  //spacing step
    double dy = (ymax-ymin)/(ny0);      
    double dz = (zmax-zmin)/(nz0);      
    
    vector<double> x(nx,0.0);           
    vector<double> y(ny,0.0);            
    vector<double> z(nz,0.0);       

    for (int i=0;i<nx;i++)
        x[i] = xmin+(double)(i-5)*dx;  //building grid for x direction
        
    for (int j=0;j<ny;j++)
        y[j] = ymin+(double)(j-5)*dy;

    for (int k=0;k<nz;k++)
        z[k] = zmin+(double)(k-5)*dz;
 // ug is velocity component u on grid at the beginning
    vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    vector<vector<vector<double> > > vg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > wg (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));   

// particles at Lagrangian points
    vector<vector<vector<double> > > xpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    vector<vector<vector<double> > > ypStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > zpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

//the particle velocity at Lagrangian locations
    vector<vector<vector<double> > > upStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));        
    vector<vector<vector<double> > > vpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));     
    vector<vector<vector<double> > > wpStar (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 
    
    int i,j,k;
    #pragma omp parallel for private (j,k) schedule(dynamic)
    for (i=0;i<nx;i++)
        for (j=0;j<ny;j++)
            for (k=0;k<nz;k++)
            {
                //initial values of velocity components at grid nodes
                ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);    
                vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
                wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
            }


    double dt = 0.001;  // time step

//the particle velocity upStar at Lagrangian location xpStar is 
//obtained from the velocity ug at the grid nodes (x,y,z) 
//using a kernel interpolation function.

    cout<<" start "<<endl;
    #pragma omp parallel for private (j,k) schedule(dynamic)
    for (i=0;i<nx;i++) 
        for (j=0;j<ny;j++) 
            for (k=0;k<nz;k++){

                xpStar[i][j][k]= x[i] + dt*ug[i][j][k]; 
                ypStar[i][j][k]= y[j] + dt*vg[i][j][k];
                zpStar[i][j][k]= z[k] + dt*wg[i][j][k];
            }

    #pragma omp parallel for private (j,k) schedule(dynamic)   
    for (i=2;i<nx-2;i++) 
        for (j=2;j<ny-2;j++) 
            for (k=2;k<nz-2;k++){


                int Xindex = (int)((xpStar[i][j][k]- x[0])/dx);
                int Yindex = (int)((ypStar[i][j][k]- y[0])/dy);
                int Zindex = (int)((zpStar[i][j][k]- z[0])/dz);

                int west  = Xindex-1;  int est   = Xindex+2;
                int south = Yindex-1;  int north = Yindex+2;                
                int front = Zindex-1;  int back  = Zindex+2;    

                vector<double> BX(4,0.0);
                vector<double> BY(4,0.0);
                vector<double> BZ(4,0.0);

                vector<double> wx(4,0.0);
                vector<double> wy(4,0.0);
                vector<double> wz(4,0.0);

                for (int m=west;m<=est;m++){
                    BX[m-west]=(x[m]-xpStar[i][j][k])/dx;}
    
                for (int n=south;n<=north;n++){
                    BY[n-south]=(y[n]-ypStar[i][j][k])/dy;}

                for (int q=front;q<=back;q++){
                    BZ[q-front]=(z[q]-zpStar[i][j][k])/dz;}

                functionM4(BX,wx);
                functionM4(BY,wy);
                functionM4(BZ,wz);

                for (int m=west;m<=est;m++) 
                    for (int n=south;n<=north;n++)
                        for (int q=front;q<=back;q++){ 
                            
                            double w=wx[m-west]*wy[n-south]*wz[q-front];

                            upStar[i][j][k] += ug[m][n][q]*w;
                            vpStar[i][j][k] += vg[m][n][q]*w;
                            wpStar[i][j][k] += wg[m][n][q]*w;
                        }
            }


    cout<<" finish ! "<<endl;

    //-------------------------------------Checking results------------------------------------------//
    vector<vector<vector<double> > > exact (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));
    vector<vector<vector<double> > > error (nx,vector<vector<double> >(ny,vector <double>(nz,0.0)));

    #pragma omp parallel for
    for (int i=3;i<nx-3;i++)
        for (int j=3;j<ny-3;j++)
            for (int k=3;k<nz-3;k++)
            {
                exact[i][j][k] = -sin(xpStar[i][j][k])*cos(ypStar[i][j][k])*sin(zpStar[i][j][k]);
                error[i][j][k] = abs(exact[i][j][k]-upStar[i][j][k]);
            }

    double aa = 0.0;
    double bb = 0.0;
    double cc = 0.0;
    double dd = 0.0;
    double maxi = 0.0;
    for (int i=3;i<nx-3;i++)
        for (int j=3;j<ny-3;j++)
            for (int k=3;k<nz-3;k++)
            {
                aa += abs(exact[i][j][k]);
                bb += error[i][j][k];
                cc += error[i][j][k]*error[i][j][k];
                dd += exact[i][j][k]*exact[i][j][k];

                if (abs(error[i][j][k])>maxi)
                    maxi = abs(error[i][j][k]);
            }

    cout<<" L1_norm = "<<bb/aa<<endl;
    cout<<" L2_norm = "<<sqrt(cc/dd)<<endl;
    cout<<" L3_norm = "<<maxi<<endl;

    return 0;
//Compiler: g++ interpolation.cpp -o running -fopenmp
//export OMP_NUM_THREADS=30;
//Run:./running
}
EN

回答 1

Code Review用户

发布于 2020-10-21 11:41:55

(注意:我对OMP一无所知,所以我没有评论它!)

代码语言:javascript
复制
double pi = 3.141592653589793;

int nx0 = 400;            // number of grid in X direction
...

常量应该声明为const (它帮助程序员,如果不是编译器的话)。

代码语言:javascript
复制
for (i=0;i<nx;i++)
    for (j=0;j<ny;j++)
        for (k=0;k<nz;k++)
        {
            ug[i][j][k] = -sin(x[i])*cos(y[j])*sin(z[k]);    
            //initial values of velocity components at grid nodes
            vg[i][j][k] = -cos(x[i])*sin(y[j])*sin(z[k]);
            wg[i][j][k] = -2.0*cos(x[i])*cos(y[j])*cos(z[k]);
        }

我知道这只是设置阶段,但是对于相同的值,这是很多重复的cossin计算。我们可以将计算移到外循环,或者(更好的)我们可以分别计算sin(x[n])cos(x[n])sin(y[n])等。

一次迭代所有三个速度网格可能比三个单独的循环要慢得多,以便独立地填充每个速度网格。(由于缓存丢失)。

代码语言:javascript
复制
vector<vector<vector<double> > > ug (nx,vector<vector<double> >(ny,vector <double>(nz,0.0))); 

就内存分配而言,这是相当复杂的,所以创建(复制大量的一维和二维向量)和迭代(分配的内存不太可能是连续的,所以我们不知道如何以高速缓存高效的方式访问元素)是很慢的。

相反,尝试分配一个向量,并根据3d坐标计算索引,如下所示:

代码语言:javascript
复制
template<class T>
struct array_3d
{
    using index_t = typename std::vector<T>::size_type;

    array_3d(index_t width, index_t height, index_t depth):
        m_width(width), m_height(height), m_depth(depth),
        m_data(m_width * m_height * m_depth) { }

    T& at(index_t x, index_t y, index_t z)
    {
        return m_data[x + y * m_width + z * m_width * m_height];
    }
    
    T const& at(index_t x, index_t y, index_t z) const
    {
        return m_data[x + y * m_width + z * m_width * m_height];
    }

    index_t m_width, m_height, m_depth;
    std::vector<T> m_data;
};

如果我们在迭代这个数据结构时非常小心,我们就可以避免在内存中跳过,例如:

代码语言:javascript
复制
for (int k=0;k<nz;k++)
    for (int j=0;j<ny;j++)
        for (int i=0;i<nx;i++)
            xpStar.at(i, j, k)= x[i] + dt*ug.at(i, j, k);

xpStar.m_data的每次访问都会改变数组中的下一个元素。(因此,我们甚至可以避免从坐标中计算索引,只需增加迭代器,这可能会更快,也可能不会更快)。

(注意:我在上面实现它的方式是最内部的x组件(所以x+1index+1)。这与您的初始代码(它有最内部的z )相反,但是只要您在迭代时按正确的顺序遍历维度,选择哪一个并不重要)。

代码语言:javascript
复制
            vector<double> BX(4,0.0);
            vector<double> BY(4,0.0);
            vector<double> BZ(4,0.0);

            vector<double> wx(4,0.0);
            vector<double> wy(4,0.0);
            vector<double> wz(4,0.0);

由于这些都是固定的大小,所以使用std::array<double, 4>代替!

代码语言:javascript
复制
            for (int m=west;m<=est;m++) 
                for (int n=south;n<=north;n++)
                    for (int q=front;q<=back;q++){ 
                        
                        double w=wx[m-west]*wy[n-south]*wz[q-front];

                        upStar[i][j][k] += ug[m][n][q]*w;
                        vpStar[i][j][k] += vg[m][n][q]*w;
                        wpStar[i][j][k] += wg[m][n][q]*w;
                    }

同样,对于upStarvpStarwpStar (然后预先计算w),将其分割成单独的循环可能会更快,例如:

代码语言:javascript
复制
            array<double, 4*4*4> w;
            
            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        w[m + n*4 + q*4*4] = wx[m]*wy[n]*wz[q];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        upStar.at(i, j, k) += ug.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        vpStar.at(i, j, k) += vg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];

            for (int q=0;q!=4;q++)
                for (int n=0;n!=4;n++)
                    for (int m=0;m!=4;m++) 
                        wpStar.at(i, j, k) += wg.at(m+west, n+south, q+front) * w[m + n*4 + q*4*4];
票数 3
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/250938

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档