我今天把这个C程序放在一起来处理生物信息学数据处理任务。这个程序似乎工作正常,但我想知道是否有人建议如何解析输入数据,以及我如何在主处理循环中使用控制结构。
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUFFERSIZE 1024
FILE *fileopen(const char *filename, const char *mode)
{
FILE *fh = fopen(filename, mode);
if(fh == NULL)
{
fprintf(stderr, "error: unable to open file '%s'\n", filename);
exit(1);
}
return fh;
}
void *memalloc(size_t size)
{
void *memory = malloc(size);
if(memory == NULL)
{
fprintf(stderr, "error: unable to allocate memory\n");
exit(1);
}
return memory;
}
int main(int argc, const char **argv)
{
// Parse command line arguments
char seqbuffer1[BUFFERSIZE];
char seqbuffer2[BUFFERSIZE];
char qualbuffer1[BUFFERSIZE];
char qualbuffer2[BUFFERSIZE];
char *seqbufferout;
FILE *seqinfile;
FILE *seqoutfile;
FILE *qualinfile;
FILE *qualoutfile;
int seqlength;
if(argc != 6)
{
fprintf(stderr, "error: 5 arguments required, %d provided\n", argc - 1);
exit(1);
}
seqinfile = fileopen(argv[1], "r");
qualinfile = fileopen(argv[2], "r");
seqoutfile = fileopen(argv[3], "w");
qualoutfile = fileopen(argv[4], "w");
seqlength = atoi(argv[5]);
assert(seqlength > 0);
// Process reads
seqbufferout = (char *)memalloc( sizeof(char) * (seqlength + 1) );
seqbufferout[seqlength] = '\0';
while(fgets(seqbuffer1, BUFFERSIZE, seqinfile))
{
if(strlen(seqbuffer1) > 0)
{
int ambiguous;
int i;
assert(strncmp(seqbuffer1, ">", 1) == 0);
if(fgets(seqbuffer2, BUFFERSIZE, seqinfile) == NULL)
{
fprintf(stderr, "error: file ends with a fasta header\n");
exit(1);
}
if( fgets(qualbuffer1, BUFFERSIZE, qualinfile) == NULL ||
fgets(qualbuffer2, BUFFERSIZE, qualinfile) == NULL )
{
fprintf(stderr, "error: no quality entry corresponding to sequence '%s'\n", seqbuffer1);
exit(1);
}
ambiguous = 0;
for(i = 0; i < seqlength; i++)
{
if(seqbuffer2[i] == '.')
{
ambiguous = 1;
break;
}
seqbufferout[i] = seqbuffer2[i];
}
if(!ambiguous)
{
fprintf(seqoutfile, "%s%s\n", seqbuffer1, seqbufferout);
fputs(qualbuffer1, qualoutfile);
i = 0;
char *tok = strtok(qualbuffer2, " ");
while(tok != NULL && i < seqlength - 1)
{
if(i > 0)
fputs(" ", qualoutfile);
fputs(tok, qualoutfile);
tok = strtok(NULL, " ");
i++;
}
fputs("\n", qualoutfile);
}
}
}
fclose(seqinfile);
fclose(qualinfile);
fclose(seqoutfile);
fclose(qualoutfile);
free(seqbufferout);
return 0;
}发布于 2012-05-24 14:29:46
与通过执行if(strlen(seqbuffer1) > 0)生成的大缩进块不同的是,do:if (strlen(seqbuffer1) <1) continue;,这样整个循环的其余部分就可以超出一个级别,从而使其更具可读性。
类似地,您也可以将if(!ambiguous)转换为if (ambiguous) continue;,并输出该块。
根据您使用的C语言版本,模棱两可可能是boolean而不是int。
与fget()的结果相比,您的空比较可能更一致一些-- while()条件中的那个是隐式的!= NULL,但是if()语句中的那个是显式的== NULL。在任何地方你都应该是显式的,或者是隐式的--这两种选择都很好,但是混合风格会让你的代码更加不清晰。
assert(strncmp(seqbuffer1, ">", 1) == 0);和更容易理解的assert(seqbuffer1[0] == '>')是一样的,对吗?
另外,您至少有一个潜在的错误:如果BUFFERSIZE比输入行长度小,然后seqlength > BUFFERSIZE呢?
发布于 2012-05-25 14:22:32
一般性意见:
perror而不是fprintfmain()
seqbufferout和memalloc()是冗余的。memalloc的调用应该省略sizeof(char),默认情况下这是1,并且应该省略强制转换。sizeof buf而不是BUFFERSIZEfgets(seqbuffer1...) != NULL和strlen(seqbuffer1) == 0是一个可能的组合吗?我不这么认为--在这种情况下,斯特伦是多余的seqbuffer1[0] == '>'上断言,而不是在strcmp上strchr会更好seqbuffer2复制到seqbufferout似乎是多余的,因为您只需要打印seqbufferout的内容qualbuffer2字符串中剥离前尾空格和多个空格--对吗?ambiguous = 0的内容和包含的内容变成: if (!strchr(seqbuffer2,'.')) { fprintf(seqoutfile,"%s%s\n",seqbuffer1,seqbuffer2);fput(qualbuffer1,qualoutfile);fput(strip_spaces(Qualbuffer2),qualoutfile);fput(“n”,质量文件);}https://codereview.stackexchange.com/questions/12016
复制相似问题