这是C语言中viterbi算法的一个实现,它遵循Durbin et。S出版的“生物序列分析”(2002)一书,标题中有更多关于使用的信息,以及程序的具体操作。
它运行良好,没有内存泄漏,但我感兴趣的是如何更好地构造代码,并可能使其运行得更快。我对C的好款式还不太了解。首先,我想我的main()函数太多了,我应该把它分解成更小的函数,但是我想知道最好的方法。
我还对一种更好的文件读取方法感兴趣,因为我听说fscanf很危险,而且容易出错。还有其他错误处理我也错过了吗?
如果您想编译并运行图书示例,这里有相应的文本文件:文本文件
/**
* Implementation of the viterbi algorithm for estimating the states of a Hidden Markov Model given at least a sequence text file.
* Program automatically determines n value from sequence file and assumes that state file has same n value.
*
* Program follows example from Durbin et. al.'s "The occasionally dishonest casino, part 1." on p. 54 of Biological Sequence
* Analyis (2002), with solution and viterbi output given on p. 57. The two states, F and L, correspond to a "Fair" or a "Loaded" die.
*
* free .pdf of Durbin at: http://dnapunctuation.org/~poptsova/course/Durbin-Et-Al-Biological-Sequence-Analysis-CUP-2002-No-OCR.pdf
*
* Optional argument to read in file of known states for comparison with algorithm's output.
* Sequence file and state files are is assumed to be one entry per line (see .txt files for example).
*
* Usage: ./viterbi my_sequence_file.txt my_state_file.txt
* my_sequence_file.txt = sequence file (required)
* my_state_file.txt = state file (optional)
*
**/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
double max (double a, double b);
int argmax (double row0, double row1);
int main (int argc, char *argv[])
{
// check for correct number of command line args
if (argc != 2 && argc != 3)
{
printf("Usage: ./viterbi my_sequence_file.txt my_state_file.txt . Include at least sequence file.\n");
return 1;
}
// open sequence file and store in array. Dynamically allocate memory and automatically detect sequence n value
FILE *seqf = fopen(argv[1], "r");
if (!seqf)
{
printf("Invalid sequence file.\n");
return 1;
}
int num;
int memsize = 100;
int n = 0;
int *seq = calloc(memsize, sizeof(int));
while(fscanf(seqf, "%i", &num) == 1)
{
seq[n] = num - 1;
n++;
if (n == memsize)
{
memsize += 100;
seq = realloc(seq, memsize * sizeof(int));
}
if(!seq)
{
printf("Not enough memory.");
return 1;
}
}
fclose(seqf);
// if passed as an argument, open the state solution file and print. Assumes n is same as sequence n above
if(argv[2])
{
FILE *statef = fopen(argv[2], "r");
if (!statef)
{
printf("Invalid state file.\n");
return 1;
}
char *state = calloc(n, sizeof(char));
if(!state)
{
printf("Not enough memory.");
return 1;
}
char ch;
printf("State solution:\n");
for (int i = 0; i < n; i++)
{
fscanf(statef, "%c %*[\r\n]", &ch);
state[i] = ch;
if (i % 60 == 0 && i != 0)
{
printf("\n");
}
printf("%c", state[i]);
}
fclose(statef);
free(state);
printf("\n\n");
}
// state transition matrix in log space
double a[2][2] = {
{ log(0.95), log(0.05) },
{ log(0.1), log(0.9) }
};
// emission probabilities, corresponding to p of rolling 1 thru 6 on fair or loaded die
double e[6][2] = {
{ log( ((double) 1)/6), log(0.1) },
{ log( ((double) 1)/6), log(0.1) },
{ log( ((double) 1)/6), log(0.1) },
{ log( ((double) 1)/6), log(0.1) },
{ log( ((double) 1)/6), log(0.1) },
{ log( ((double) 1)/6), log(0.5) },
};
// allocate rest of memory and error handle
int *path = calloc(n, sizeof(double));
double **vprob = calloc(n, sizeof(double *));
double **ptr = calloc(n, sizeof(double *));
double **pi = calloc(n, sizeof(double *));
if( !path || !vprob || !ptr || !pi )
{
printf("Not enough memory.");
return 1;
}
for (int i = 0; i < 2; i++)
{
vprob[i] = calloc(n, sizeof(double));
ptr[i] = calloc(n, sizeof(double));
pi[i] = calloc(n, sizeof(double));
if( !vprob[i] || !ptr[i] || !pi[i] )
{
printf("Not enough memory.");
return 1;
}
}
// initialize vprob array; assumed starting state is state F
vprob[0][0] = 1;
vprob[1][0] = 0;
double row0;
double row1;
// viterbi algorithm in log space to avoid underflow
for (int i = 1; i < n; i++)
{
for (int j = 0; j < 2; j++)
{
row0 = (vprob[0][i - 1] + a[0][j]);
row1 = (vprob[1][i - 1] + a[1][j]);
vprob[j][i] = e[seq[i]][j] + max( row0, row1 );
ptr[j][i] = argmax( row0, row1 );
pi[j][i] = max( row0 , row1 );
}
}
free(seq);
// traceback to find most likely path
path[n - 1] = argmax( pi[0][n - 1], pi[1][n - 1] );
for (int i = n - 2; i > 0; i--)
{
path[i] = ptr[path[i + 1]][i + 1];
}
// free remaining memory
for (int i = 0; i < 2; i++)
{
free(vprob[i]);
free(ptr[i]);
free(pi[i]);
}
free(vprob);
free(ptr);
free(pi);
// print viterbi result
printf("Viterbi output:\n");
for (int i = 0; i < n; i++)
{
if (i % 60 == 0 && i != 0)
{
printf("\n");
}
if (path[i] == 0)
{
printf("F");
}
if (path[i] == 1)
{
printf("L");
}
}
printf("\n");
free(path);
return 0;
}
double max (double a, double b)
{
if (a > b)
{
return a;
}
else if (a < b)
{
return b;
}
// if equal, returns arbitrary argument for specific use in this algorithm
return b;
}
int argmax (double row0, double row1)
{
if (row0 > row1)
{
return 0;
}
else if (row0 < row1)
{
return 1;
}
return row1;
}发布于 2017-05-18 20:34:40
欢迎来到代码评论,非常好的第一个问题,代码审查!
您对程序结构是非常正确的,您可能需要对软件设计原则进行一些研究,如单责任原则、不要重复自己的原则(干代码)、保持简单的KIS(S)原则和德米特定律。
目前,程序正在从文件中读取状态信息,但从未使用状态信息,状态数组在输入后会被丢弃。这几乎可以被认为是一个错误或坏代码。
当C第一次开发时,创建者创建了3个要使用的文件指针,stdin、stdout和stderr。函数printf()打印到stdout,scanf()从stdin读取,并创建了一个用于报告错误消息的特殊文件指针,这是stderr。最好向stderr报告错误,而不是stdout,因为progname arguments > outfile重定向文件输出不会重定向stderr。也可以使用progname arguments >& outfile (Unix和Linux)将stderr重定向到文件。
要将错误打印到stderr:
fprintf(stderr, "ERROR MESSAGE");单一责任原则指出,每个模块或类都应该对软件提供的功能的单个部分负有责任,这种责任应该完全由类封装。它的所有服务都应与这一责任密切配合。
Robert C. Martin expresses the principle as follows:
`A class should have only one reason to change.`虽然这主要是针对面向对象语言中的类,但它也适用于像C这样的过程语言中的函数和子程序。
主要职能可分为至少4项职能:
int* resize_seq(int* seq, int* memsize);
int* process_sequence_input_file(/* in variable */ char* input_file, /* out variables */ int* out_n)
char* process_state_file(char* statefilename, int n);
int execute_biological_sequence_analysis(int *seq, int sequence_size, char *state)函数execute_biological_sequence_analysis()也可以分解为多个函数。
下面是一个例子,说明程序遵循单一责任原则可能是什么样子。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#define SEQUENCE_MEMORY_ALLOCATION_SIZE 100
#define DIE_FACE_COUNT 6
double max (double a, double b)
{
if (a > b)
{
return a;
}
else if (a < b)
{
return b;
}
// if equal, returns arbitrary argument for specific use in this algorithm
return b;
}
double argmax (double row0, double row1)
{
if (row0 > row1)
{
return 0;
}
else if (row0 < row1)
{
return 1;
}
return row1;
}
int* resize_seq(int* seq, int* memsize)
{
*memsize += SEQUENCE_MEMORY_ALLOCATION_SIZE;
seq = realloc(seq, *memsize * sizeof(int));
if(!seq)
{
fprintf(stderr, "Not enough memory to allocate or reallocate seq.");
}
return seq;
}
int* process_sequence_input_file(/* in variable */ char* input_file, /* out variables */ int* out_n)
{
int memsize = SEQUENCE_MEMORY_ALLOCATION_SIZE;
int n = 0;
int num;
int *seq = calloc(memsize, sizeof(int));
if (!seq)
{
return NULL;
}
// open sequence file and store in array. Dynamically allocate memory and automatically detect sequence n value
FILE *seqf = fopen(input_file, "r");
if (!seqf)
{
fprintf(stderr, "Can't open input file %s.\n", input_file);
free(seq);
return NULL;
}
while(fscanf(seqf, "%i", &num) == 1)
{
seq[n] = num - 1;
n++;
if (n == memsize)
{
if (!(seq = resize_seq(seq, &memsize))) {
return NULL;
}
}
}
fclose(seqf);
*out_n = n; // return n
return seq;
}
char* process_state_file(char* statefilename, int n)
{
FILE *statef = fopen(statefilename, "r");
if (!statef)
{
printf("Invalid state file.\n");
return NULL;
}
char *state = calloc(n, sizeof(char));
if(!state)
{
printf("Not enough memory.");
return NULL;
}
char ch;
printf("State solution:\n");
for (int i = 0; i < n; i++)
{
fscanf(statef, "%c %*[\r\n]", &ch);
state[i] = ch;
if (i % 60 == 0 && i != 0)
{
printf("\n");
}
printf("%c", state[i]);
}
fclose(statef);
printf("\n\n");
return state;
}
int execute_biological_sequence_analysis(int *seq, int sequence_size, char *state)
{
int status = EXIT_SUCCESS;
// state transition matrix in log space
double a[2][2] = {
{ log(0.95), log(0.05) },
{ log(0.1), log(0.9) }
};
// emission probabilities, corresponding to p of rolling 1 thru 6 on fair or loaded die
double e[DIE_FACE_COUNT][2] = {
{ log( ((double) 1)/DIE_FACE_COUNT), log(0.1) },
{ log( ((double) 1)/DIE_FACE_COUNT), log(0.1) },
{ log( ((double) 1)/DIE_FACE_COUNT), log(0.1) },
{ log( ((double) 1)/DIE_FACE_COUNT), log(0.1) },
{ log( ((double) 1)/DIE_FACE_COUNT), log(0.1) },
{ log( ((double) 1)/DIE_FACE_COUNT), log(0.5) },
};
// allocate rest of memory and error handle
int *path = calloc(sequence_size, sizeof(double));
double **vprob = calloc(sequence_size, sizeof(double *));
double **ptr = calloc(sequence_size, sizeof(double *));
double **pi = calloc(sequence_size, sizeof(double *));
if( !path || !vprob || !ptr || !pi )
{
printf("Not enough memory.");
return 1;
}
for (int i = 0; i < 2; i++)
{
vprob[i] = calloc(sequence_size, sizeof(double));
ptr[i] = calloc(sequence_size, sizeof(double));
pi[i] = calloc(sequence_size, sizeof(double));
if( !vprob[i] || !ptr[i] || !pi[i] )
{
printf("Not enough memory.");
return EXIT_FAILURE;
}
}
// initialize vprob array; assumed starting state is state F
vprob[0][0] = 1;
vprob[1][0] = 0;
double row0;
double row1;
// viterbi algorithm in log space to avoid underflow
for (int i = 1; i < sequence_size; i++)
{
for (int j = 0; j < 2; j++)
{
row0 = (vprob[0][i - 1] + a[0][j]);
row1 = (vprob[1][i - 1] + a[1][j]);
vprob[j][i] = e[seq[i]][j] + max( row0, row1 );
ptr[j][i] = argmax( row0, row1 );
pi[j][i] = max( row0 , row1 );
}
}
// traceback to find most likely path
path[sequence_size - 1] = argmax( pi[0][sequence_size - 1], pi[1][sequence_size - 1] );
for (int i = sequence_size - 2; i > 0; i--)
{
path[i] = ptr[path[i + 1]][i + 1];
}
// free remaining memory
for (int i = 0; i < 2; i++)
{
free(vprob[i]);
free(ptr[i]);
free(pi[i]);
}
free(vprob);
free(ptr);
free(pi);
// print viterbi result
printf("Viterbi output:\n");
for (int i = 0; i < sequence_size; i++)
{
if (i % 60 == 0 && i != 0)
{
printf("\n");
}
if (path[i] == 0)
{
printf("F");
}
if (path[i] == 1)
{
printf("L");
}
}
printf("\n");
free(path);
return status;
}
int main (int argc, char *argv[])
{
int status = EXIT_SUCCESS;
// check for correct number of command line args
if (argc != 2 && argc != 3)
{
fprintf(stderr, "Usage: ./viterbi my_sequence_file.txt my_state_file.txt . Include at least sequence file.\n");
return EXIT_FAILURE;
}
int sequence_size = 0;
int *seq;
if (!(seq = process_sequence_input_file(argv[1], &sequence_size)))
{
return EXIT_FAILURE;
}
char *state = NULL;
// if passed as an argument, open the state solution file and print. Assumes n is same as sequence n above
if(argv[2])
{
if (!(state = process_state_file(argv[2], sequence_size)))
{
return EXIT_FAILURE;
}
}
status = execute_biological_sequence_analysis(seq, sequence_size, state);
free(seq);
free(state);
return status;
}。
有一些数字常量,如6,100,0.1,0.5可以表示为符号常量,下面是其中两个例子
#define SEQUENCE_MEMORY_ALLOCATION_SIZE 100
#define DIE_FACE_COUNT 6在stdlib.h和stdlib中定义了2个常量,它们是EXIT_SUCCESS和EXIT_FAILURE。这使得main()更易读,而不是返回0,从main返回出口_成功或退出_失败,这也将允许在将来的某个时候添加错误处理。
符号常量在许多方面帮助程序员:
一般来说,变量名是可以的,有些可以通过额外的长度来改进,比如将seq更改为sequence。一个变量可能应该从n重命名为`sequence_size,因为它在大部分程序中都使用过。
的返回
该代码包含所有必要的测试,以查看calloc()是否返回一个值,但如果每个对calloc()的调用都立即进行测试,而不是下面的代码所做的操作,情况会更好:
// allocate rest of memory and error handle
int *path = calloc(sequence_size, sizeof(double));
double **vprob = calloc(sequence_size, sizeof(double *));
double **ptr = calloc(sequence_size, sizeof(double *));
double **pi = calloc(sequence_size, sizeof(double *));
if( !path || !vprob || !ptr || !pi )
{
printf("Not enough memory.");
return EXIT_FAILURE;
}
for (int i = 0; i < 2; i++)
{
vprob[i] = calloc(sequence_size, sizeof(double));
ptr[i] = calloc(sequence_size, sizeof(double));
pi[i] = calloc(sequence_size, sizeof(double));
if( !vprob[i] || !ptr[i] || !pi[i] )
{
printf("Not enough memory.");
return EXIT_FAILURE;
}
}一种可能的解决方案是创建一个调用calloc()的函数,然后测试返回值,这将应用DRY原则(不要重复自己)。
double** calloc_with_test_and_error_message(int memsize, char *arrayName)
{
double** arrayofdoublepointers = calloc(1, memsize);
if (!arrayofdoublepointers)
{
fprintf(stderr, "ERROR: calloc failed to create the array of double pointers %s\n", arrayName);
}
return arrayofdoublepointers;
}发布于 2017-05-19 01:28:43
我注意到您在这里分配了几个长度为n的数组:
//分配内存的其余部分和错误句柄int *path = calloc(n,sizeof of ( double ));double **vprob = calloc(n,sizeof of( double *));double **ptr = calloc(n,sizeof of(double *));double **pi = calloc(n,sizeof of(double *));if( !path !vprob !ptr!ptr){printf(“不够内存”);返回1;}
第一个问题是,对于path,您为n双倍而不是n ints分配了空间(很可能是错误的)。为了避免这个问题,您应该在sizeof()中使用变量本身,而不是硬编码类型。
第二个问题是,在这4个数组中,只有path需要大小为n。另外三个应该是2码。所以,实际上,你甚至不需要分配这些,你可以这样做:
// allocate rest of memory and error handle
int *path = calloc(n, sizeof(*path));
double *vrob[2] = {0};
double *ptr [2] = {0};
double *pi [2] = {0};
if( !path )
{
printf("Not enough memory.");
return EXIT_FAILURE;
}argmax()函数在row1时返回row0 == row1,但我认为它应该返回1。稍后,argmax()的结果被用作ptr[]数组的索引,该数组只有两个条目。因此,例如,如果row0和row1都是5,那么稍后就会出现缓冲区溢出。
发布于 2017-05-19 17:20:08
OP的代码在很大程度上得到了很好的审查。只是加了一点
"%*[\r\n]" in fscanf(statef, "%c %*[\r\n]", &ch);没有任何用途。
" "以fscanf()格式使用任何0或更多空格字符,包括\r和\n.所以这些都不是"%*[\r\n]"可以使用的东西了。
此外,如果该文件以像'\n'这样的空白开头,则该字符将保存在ch中。不太可能是代码的意图。
如果没有足够的数据,代码可能使用state[i] = ch;读取未初始化的数据,这是未定义的行为(UB)。
考虑在使用fscanf()之前验证ch结果,至少检查返回值。
char ch;
printf("State solution:\n");
fflush(stdout); // Insure output is seen
for (int i = 0; i < n; i++) {
// skip white-spaces
// v
if (fscanf(statef, " %c", &ch) != 1) {
Handle_Scant_Input();
}
state[i] = ch;
...https://codereview.stackexchange.com/questions/163674
复制相似问题