首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >Viterbi算法在C中的实现

Viterbi算法在C中的实现
EN

Code Review用户
提问于 2017-05-18 16:40:01
回答 3查看 3.9K关注 0票数 7

这是C语言中viterbi算法的一个实现,它遵循Durbin et。S出版的“生物序列分析”(2002)一书,标题中有更多关于使用的信息,以及程序的具体操作。

它运行良好,没有内存泄漏,但我感兴趣的是如何更好地构造代码,并可能使其运行得更快。我对C的好款式还不太了解。首先,我想我的main()函数太多了,我应该把它分解成更小的函数,但是我想知道最好的方法。

我还对一种更好的文件读取方法感兴趣,因为我听说fscanf很危险,而且容易出错。还有其他错误处理我也错过了吗?

如果您想编译并运行图书示例,这里有相应的文本文件:文本文件

代码语言:javascript
复制
/**
 * Implementation of the viterbi algorithm for estimating the states of a Hidden Markov Model given at least a sequence text file.
 * Program automatically determines n value from sequence file and assumes that state file has same n value.
 * 
 * Program follows example from Durbin et. al.'s "The occasionally dishonest casino, part 1." on p. 54 of Biological Sequence
 * Analyis (2002), with solution and viterbi output given on p. 57. The two states, F and L, correspond to a "Fair" or a "Loaded" die.
 * 
 * free .pdf of Durbin at: http://dnapunctuation.org/~poptsova/course/Durbin-Et-Al-Biological-Sequence-Analysis-CUP-2002-No-OCR.pdf
 * 
 * Optional argument to read in file of known states for comparison with algorithm's output. 
 * Sequence file and state files are is assumed to be one entry per line (see .txt files for example).
 * 
 * Usage: ./viterbi my_sequence_file.txt my_state_file.txt
 * my_sequence_file.txt = sequence file (required)
 * my_state_file.txt = state file (optional)
 *
**/


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>

double max (double a, double b);
int argmax (double row0, double row1);

int main (int argc, char *argv[]) 
{
    // check for correct number of command line args
    if (argc != 2 && argc != 3)
    {
        printf("Usage: ./viterbi my_sequence_file.txt my_state_file.txt .  Include at least sequence file.\n");
        return 1;
    }

    // open sequence file and store in array. Dynamically allocate memory and automatically detect sequence n value
    FILE *seqf = fopen(argv[1], "r");
    if (!seqf)
    {
        printf("Invalid sequence file.\n");
        return 1;
    }

    int num;
    int memsize = 100;
    int n = 0;
    int *seq = calloc(memsize, sizeof(int));

    while(fscanf(seqf, "%i", &num) == 1)
    {
        seq[n] = num - 1;
        n++;

        if (n == memsize)
        {
            memsize += 100;
            seq = realloc(seq, memsize * sizeof(int));
        }

        if(!seq)
        {
            printf("Not enough memory.");
            return 1;
        }
    }
    fclose(seqf);

    // if passed as an argument, open the state solution file and print. Assumes n is same as sequence n above
    if(argv[2])
    {
        FILE *statef = fopen(argv[2], "r");

        if (!statef)
        {
            printf("Invalid state file.\n");
            return 1;
        }

        char *state = calloc(n, sizeof(char));
        if(!state)
        {
            printf("Not enough memory.");
            return 1;
        }

        char ch;
        printf("State solution:\n");
        for (int i = 0; i < n; i++)
        {
            fscanf(statef, "%c %*[\r\n]", &ch);
            state[i] = ch;

            if (i % 60 == 0 && i != 0)
            {
                printf("\n");
            }
            printf("%c", state[i]);
        }
        fclose(statef);
        free(state);
        printf("\n\n");
    }

    // state transition matrix in log space
    double a[2][2] = {
        { log(0.95),  log(0.05) },
        { log(0.1),  log(0.9) }
    };

    // emission probabilities, corresponding to p of rolling 1 thru 6 on fair or loaded die
    double e[6][2] = {
        { log( ((double) 1)/6),  log(0.1) },
        { log( ((double) 1)/6),  log(0.1) },
        { log( ((double) 1)/6),  log(0.1) },
        { log( ((double) 1)/6),  log(0.1) },
        { log( ((double) 1)/6),  log(0.1) },
        { log( ((double) 1)/6),  log(0.5) },
    };

    // allocate rest of memory and error handle
    int *path = calloc(n, sizeof(double));
    double **vprob = calloc(n, sizeof(double *));
    double **ptr = calloc(n, sizeof(double *));
    double **pi = calloc(n, sizeof(double *));

    if( !path || !vprob || !ptr || !pi )
        {
            printf("Not enough memory.");
            return 1;
        }

    for (int i = 0; i < 2; i++)
    {
        vprob[i] = calloc(n, sizeof(double));
        ptr[i] = calloc(n, sizeof(double));
        pi[i] = calloc(n, sizeof(double));

        if( !vprob[i] || !ptr[i] || !pi[i] )
        {
            printf("Not enough memory.");
            return 1;
        }
    }

    // initialize vprob array; assumed starting state is state F
    vprob[0][0] = 1;
    vprob[1][0] = 0;
    double row0;
    double row1;

    // viterbi algorithm in log space to avoid underflow
    for (int i = 1; i < n; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            row0 = (vprob[0][i - 1] + a[0][j]);
            row1 = (vprob[1][i - 1] + a[1][j]);

            vprob[j][i] = e[seq[i]][j] + max( row0, row1 );
            ptr[j][i] = argmax( row0, row1 );
            pi[j][i] = max( row0 , row1 );
        }
    }
    free(seq);

    // traceback to find most likely path
    path[n - 1] = argmax( pi[0][n - 1], pi[1][n - 1] );
    for (int i = n - 2; i > 0; i--)
    {
        path[i] = ptr[path[i + 1]][i + 1];
    }

    // free remaining memory
    for (int i = 0; i < 2; i++)
    {
        free(vprob[i]);
        free(ptr[i]);
        free(pi[i]);
    }
    free(vprob);
    free(ptr);
    free(pi);

    // print viterbi result
    printf("Viterbi output:\n");
    for (int i = 0; i < n; i++)
    {   
        if (i % 60 == 0 && i != 0)
        {
            printf("\n");
        }

        if (path[i] == 0)
        {
            printf("F");
        }
        if (path[i] == 1)
        {
            printf("L");
        }
    }
    printf("\n");
    free(path);
    return 0;
}


double max (double a, double b)
{
    if (a > b)
    {
        return a;
    }
    else if (a < b)
    {
    return b;
    }
    // if equal, returns arbitrary argument for specific use in this algorithm
    return b;
}

int argmax (double row0, double row1)
{
    if (row0 > row1)
    {
        return 0;
    }
    else if (row0 < row1)
    {
        return 1;
    }
    return row1;
}
EN

回答 3

Code Review用户

发布于 2017-05-18 20:34:40

欢迎来到代码评论,非常好的第一个问题,代码审查!

您对程序结构是非常正确的,您可能需要对软件设计原则进行一些研究,如单责任原则、不要重复自己的原则(干代码)保持简单的KIS(S)原则和德米特定律

状态文件是什么?

目前,程序正在从文件中读取状态信息,但从未使用状态信息,状态数组在输入后会被丢弃。这几乎可以被认为是一个错误或坏代码。

使用stderr报告错误

当C第一次开发时,创建者创建了3个要使用的文件指针,stdinstdoutstderr。函数printf()打印到stdoutscanf()stdin读取,并创建了一个用于报告错误消息的特殊文件指针,这是stderr。最好向stderr报告错误,而不是stdout,因为progname arguments > outfile重定向文件输出不会重定向stderr。也可以使用progname arguments >& outfile (Unix和Linux)将stderr重定向到文件。

要将错误打印到stderr

代码语言:javascript
复制
    fprintf(stderr, "ERROR MESSAGE");

降低复杂度,遵循SRP

单一责任原则指出,每个模块或类都应该对软件提供的功能的单个部分负有责任,这种责任应该完全由类封装。它的所有服务都应与这一责任密切配合。

代码语言:javascript
复制
Robert C. Martin expresses the principle as follows:
    `A class should have only one reason to change.`

虽然这主要是针对面向对象语言中的类,但它也适用于像C这样的过程语言中的函数和子程序。

主要职能可分为至少4项职能:

代码语言:javascript
复制
int* resize_seq(int* seq, int* memsize);
int* process_sequence_input_file(/* in variable */ char* input_file, /* out variables */ int* out_n)
char* process_state_file(char* statefilename, int n);
int execute_biological_sequence_analysis(int *seq, int sequence_size, char *state)

函数execute_biological_sequence_analysis()也可以分解为多个函数。

下面是一个例子,说明程序遵循单一责任原则可能是什么样子。

代码语言:javascript
复制
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>

#define SEQUENCE_MEMORY_ALLOCATION_SIZE    100
#define DIE_FACE_COUNT  6

double max (double a, double b)
{
    if (a > b)
    {
        return a;
    }
    else if (a < b)
    {
        return b;
    }
    // if equal, returns arbitrary argument for specific use in this algorithm
    return b;
}

double argmax (double row0, double row1)
{
    if (row0 > row1)
    {
        return 0;
    }
    else if (row0 < row1)
    {
        return 1;
    }
    return row1;
}

int* resize_seq(int* seq, int* memsize)
{
    *memsize += SEQUENCE_MEMORY_ALLOCATION_SIZE;
    seq = realloc(seq, *memsize * sizeof(int));
    if(!seq)
    {
        fprintf(stderr, "Not enough memory to allocate or reallocate seq.");
    }
    return seq;
}

int* process_sequence_input_file(/* in variable */ char* input_file, /* out variables */ int* out_n)
{
    int memsize = SEQUENCE_MEMORY_ALLOCATION_SIZE;
    int n = 0;
    int num;

    int *seq = calloc(memsize, sizeof(int));
    if (!seq)
    {
        return NULL;
    }

    // open sequence file and store in array. Dynamically allocate memory and automatically detect sequence n value
    FILE *seqf = fopen(input_file, "r");
    if (!seqf)
    {
        fprintf(stderr, "Can't open input file %s.\n", input_file);
        free(seq);
        return NULL;
    }

    while(fscanf(seqf, "%i", &num) == 1)
    {
        seq[n] = num - 1;
        n++;

        if (n == memsize)
        {
            if (!(seq = resize_seq(seq, &memsize))) {
                return NULL;
            }
        }

    }
    fclose(seqf);

    *out_n = n;         // return n

    return seq;
}

char* process_state_file(char* statefilename, int n)
{
    FILE *statef = fopen(statefilename, "r");

    if (!statef)
    {
        printf("Invalid state file.\n");
        return NULL;
    }

    char *state = calloc(n, sizeof(char));
    if(!state)
    {
        printf("Not enough memory.");
        return NULL;
    }

    char ch;
    printf("State solution:\n");
    for (int i = 0; i < n; i++)
    {
        fscanf(statef, "%c %*[\r\n]", &ch);
        state[i] = ch;

        if (i % 60 == 0 && i != 0)
        {
            printf("\n");
        }
        printf("%c", state[i]);
    }
    fclose(statef);
    printf("\n\n");

    return state;
}

int execute_biological_sequence_analysis(int *seq, int sequence_size, char *state)
{
    int status = EXIT_SUCCESS;

    // state transition matrix in log space
    double a[2][2] = {
        { log(0.95),  log(0.05) },
        { log(0.1),  log(0.9) }
    };

    // emission probabilities, corresponding to p of rolling 1 thru 6 on fair or loaded die
    double e[DIE_FACE_COUNT][2] = {
        { log( ((double) 1)/DIE_FACE_COUNT),  log(0.1) },
        { log( ((double) 1)/DIE_FACE_COUNT),  log(0.1) },
        { log( ((double) 1)/DIE_FACE_COUNT),  log(0.1) },
        { log( ((double) 1)/DIE_FACE_COUNT),  log(0.1) },
        { log( ((double) 1)/DIE_FACE_COUNT),  log(0.1) },
        { log( ((double) 1)/DIE_FACE_COUNT),  log(0.5) },
    };

    // allocate rest of memory and error handle
    int *path = calloc(sequence_size, sizeof(double));
    double **vprob = calloc(sequence_size, sizeof(double *));
    double **ptr = calloc(sequence_size, sizeof(double *));
    double **pi = calloc(sequence_size, sizeof(double *));

    if( !path || !vprob || !ptr || !pi )
    {
        printf("Not enough memory.");
        return 1;
    }

    for (int i = 0; i < 2; i++)
    {
        vprob[i] = calloc(sequence_size, sizeof(double));
        ptr[i] = calloc(sequence_size, sizeof(double));
        pi[i] = calloc(sequence_size, sizeof(double));

        if( !vprob[i] || !ptr[i] || !pi[i] )
        {
            printf("Not enough memory.");
            return EXIT_FAILURE;
        }
    }

    // initialize vprob array; assumed starting state is state F
    vprob[0][0] = 1;
    vprob[1][0] = 0;
    double row0;
    double row1;

    // viterbi algorithm in log space to avoid underflow
    for (int i = 1; i < sequence_size; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            row0 = (vprob[0][i - 1] + a[0][j]);
            row1 = (vprob[1][i - 1] + a[1][j]);

            vprob[j][i] = e[seq[i]][j] + max( row0, row1 );
            ptr[j][i] = argmax( row0, row1 );
            pi[j][i] = max( row0 , row1 );
        }
    }

    // traceback to find most likely path
    path[sequence_size - 1] = argmax( pi[0][sequence_size - 1], pi[1][sequence_size - 1] );
    for (int i = sequence_size - 2; i > 0; i--)
    {
        path[i] = ptr[path[i + 1]][i + 1];
    }

    // free remaining memory
    for (int i = 0; i < 2; i++)
    {
        free(vprob[i]);
        free(ptr[i]);
        free(pi[i]);
    }
    free(vprob);
    free(ptr);
    free(pi);

    // print viterbi result
    printf("Viterbi output:\n");
    for (int i = 0; i < sequence_size; i++)
    {
        if (i % 60 == 0 && i != 0)
        {
            printf("\n");
        }

        if (path[i] == 0)
        {
            printf("F");
        }
        if (path[i] == 1)
        {
            printf("L");
        }
    }
    printf("\n");
    free(path);

    return status;
}

int main (int argc, char *argv[])
{
    int status = EXIT_SUCCESS;
    // check for correct number of command line args
    if (argc != 2 && argc != 3)
    {
        fprintf(stderr, "Usage: ./viterbi my_sequence_file.txt my_state_file.txt .  Include at least sequence file.\n");
        return EXIT_FAILURE;
    }

    int sequence_size = 0;
    int *seq;

    if (!(seq = process_sequence_input_file(argv[1], &sequence_size)))
    {
        return EXIT_FAILURE;
    }

    char *state = NULL;
    // if passed as an argument, open the state solution file and print. Assumes n is same as sequence n above
    if(argv[2])
    {
        if (!(state = process_state_file(argv[2], sequence_size)))
        {
            return EXIT_FAILURE;
        }
    }
    status = execute_biological_sequence_analysis(seq, sequence_size, state);

    free(seq);
    free(state);

    return status;
}

使用双曲常数,而不是数字常数

有一些数字常量,如6,100,0.1,0.5可以表示为符号常量,下面是其中两个例子

代码语言:javascript
复制
#define SEQUENCE_MEMORY_ALLOCATION_SIZE    100
#define DIE_FACE_COUNT  6

在stdlib.h和stdlib中定义了2个常量,它们是EXIT_SUCCESSEXIT_FAILURE。这使得main()更易读,而不是返回0,从main返回出口_成功或退出_失败,这也将允许在将来的某个时候添加错误处理。

符号常量在许多方面帮助程序员:

  • 它们使代码更具可读性。
  • 它们减少了编辑中需要更改的行数,这使得将来更改代码更容易。
  • 当使用常量来创建数组的大小时,一次更改所有数组大小和循环常量要容易得多。

变量名

一般来说,变量名是可以的,有些可以通过额外的长度来改进,比如将seq更改为sequence。一个变量可能应该从n重命名为`sequence_size,因为它在大部分程序中都使用过。

测试分配

的返回

该代码包含所有必要的测试,以查看calloc()是否返回一个值,但如果每个对calloc()的调用都立即进行测试,而不是下面的代码所做的操作,情况会更好:

代码语言:javascript
复制
    // allocate rest of memory and error handle
    int *path = calloc(sequence_size, sizeof(double));
    double **vprob = calloc(sequence_size, sizeof(double *));
    double **ptr = calloc(sequence_size, sizeof(double *));
    double **pi = calloc(sequence_size, sizeof(double *));

    if( !path || !vprob || !ptr || !pi )
    {
        printf("Not enough memory.");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < 2; i++)
    {
        vprob[i] = calloc(sequence_size, sizeof(double));
        ptr[i] = calloc(sequence_size, sizeof(double));
        pi[i] = calloc(sequence_size, sizeof(double));

        if( !vprob[i] || !ptr[i] || !pi[i] )
        {
            printf("Not enough memory.");
            return EXIT_FAILURE;
        }
    }

一种可能的解决方案是创建一个调用calloc()的函数,然后测试返回值,这将应用DRY原则(不要重复自己)。

代码语言:javascript
复制
double** calloc_with_test_and_error_message(int memsize, char *arrayName)
{
    double** arrayofdoublepointers = calloc(1, memsize);
    if (!arrayofdoublepointers)
    {
         fprintf(stderr, "ERROR: calloc failed to create the array of double pointers %s\n", arrayName);
    }
    return arrayofdoublepointers;
}
票数 3
EN

Code Review用户

发布于 2017-05-19 01:28:43

配置过大

我注意到您在这里分配了几个长度为n的数组:

//分配内存的其余部分和错误句柄int *path = calloc(n,sizeof of ( double ));double **vprob = calloc(n,sizeof of( double *));double **ptr = calloc(n,sizeof of(double *));double **pi = calloc(n,sizeof of(double *));if( !path !vprob !ptr!ptr){printf(“不够内存”);返回1;}

第一个问题是,对于path,您为n双倍而不是n ints分配了空间(很可能是错误的)。为了避免这个问题,您应该在sizeof()中使用变量本身,而不是硬编码类型。

第二个问题是,在这4个数组中,只有path需要大小为n。另外三个应该是2码。所以,实际上,你甚至不需要分配这些,你可以这样做:

代码语言:javascript
复制
// allocate rest of memory and error handle
int    *path    = calloc(n, sizeof(*path));
double *vrob[2] = {0};
double *ptr [2] = {0};
double *pi  [2] = {0};

if( !path )
{
    printf("Not enough memory.");
    return EXIT_FAILURE;
}

电位缓冲区溢出

argmax()函数在row1时返回row0 == row1,但我认为它应该返回1。稍后,argmax()的结果被用作ptr[]数组的索引,该数组只有两个条目。因此,例如,如果row0row1都是5,那么稍后就会出现缓冲区溢出。

票数 2
EN

Code Review用户

发布于 2017-05-19 17:20:08

OP的代码在很大程度上得到了很好的审查。只是加了一点

"%*[\r\n]" in fscanf(statef, "%c %*[\r\n]", &ch);没有任何用途。

" "fscanf()格式使用任何0或更多空格字符,包括\r\n.所以这些都不是"%*[\r\n]"可以使用的东西了。

此外,如果该文件以像'\n'这样的空白开头,则该字符将保存在ch中。不太可能是代码的意图。

如果没有足够的数据,代码可能使用state[i] = ch;读取未初始化的数据,这是未定义的行为(UB)。

考虑在使用fscanf()之前验证ch结果,至少检查返回值。

代码语言:javascript
复制
    char ch;
    printf("State solution:\n");
    fflush(stdout);  // Insure output is seen 
    for (int i = 0; i < n; i++) {
        // skip white-spaces 
        //                  v
        if (fscanf(statef, " %c", &ch) != 1) {
          Handle_Scant_Input();
        }
        state[i] = ch;
        ...
票数 2
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/163674

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档