我已经实施了朴素贝叶斯算法在一个410k rows.Now的大型数据集上,我所有的记录都得到了正确的分类,但问题是程序几乎需要一个小时才能将记录写入到相应的files.What中,这是提高我的code.Here性能的最佳方式是下面的code.This代码正在将410k记录写入到相应的files.Thank中。
fp=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp)!=NULL) //Reading each line from file to calculate the file size.
{
token = strtok(line,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token1 = strtok(token,"\n");
memcpy(mystr,&token1[0],strlen(token1)-1);
mystr[strlen(token1)-1] = '\0';
if( strcmp(mystr,"ok") == 0 )
counter_ok++;
else
counter_fraud++;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
prblty_ok = counter_ok/(counter_ok+counter_fraud);
prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fclose(fp);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","a");
fp3=fopen("sales_unknown_fraud_classified.txt","a");
while(fgets(line1,80,fp)!=NULL) //Reading each line from file to calculate the file size.
{
unknwn_attr1 = strtok(line1,",");
unknwn_attr2 = strtok(NULL,",");
unknwn_attr3 = strtok(NULL,",");
unknwn_attr4 = strtok(NULL,",");
unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
fp1=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp1)!=NULL) //Reading each line from file to calculate the file size.
{
ok_fraud_attr1 = strtok(line,",");
ok_fraud_attr2 = strtok(NULL,",");
ok_fraud_attr3 = strtok(NULL,",");
ok_fraud_attr4 = strtok(NULL,",");
ok_fraud_attr5 = strtok(NULL,",");
ok_fraud_attr6 = strtok(NULL,",");
memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
//ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';
//printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str));
if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_ok_attr5++;
}
if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_fraud_attr5++;
}
}
fclose(fp1);
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if(counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
fclose(fp);
fclose(fp2);
fclose(fp3); 发布于 2012-11-18 09:32:22
我马上就能看到你可以做的几件事,按照我会尝试的顺序:
,
strlen(),strlen()。大多数像样的优化编译器将检测未更改的源代码,并优化已知未更改的char-ptr上的后续调用,所以我会最后这样做(但老实说,我仍然会这样做,因为在与OP data.strlen()上调用重复的调用是一种糟糕的做法:您一遍又一遍地重新解析同一数据文件(sales_ok_fraud.txt),一次是针对sales_unknwn.txt中的一行数据。如果sales_ok_fraud.txt能够容纳在内存中,那么12 in /abg-line-length就是很多不必要的重复解析。加载该数据一次计算其基本统计信息一次,并将其中的数据和统计信息用于其余的数据处理。逻辑约简
你可以在一个特定的地方减少大量的工作,改变这一点:
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
counter_ok_attr5++;
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
counter_fraud_attr5++;要这样做:
if (strcmp(ok_fraud_attr6_str, "ok") == 0)
{
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
counter_ok_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
counter_ok_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
counter_ok_attr5++;
}
else if (strcmp(ok_fraud_attr6_str,"fraud") == 0)
{
if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
counter_fraud_attr2++;
if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0)
counter_fraud_attr3++;
if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
counter_fraud_attr4++;
if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
counter_fraud_attr5++;
}Front-Loading sales_ok_fraud.txt
以下内容依赖于您的sales_ok_fraud.txt统计文件数据格式的神圣性,同时尝试在验证所述格式时尽可能地学究。它分配一个足够大的内存块来容纳整个文件,再加上一个字符,将整个正文视为一个null-term-string。然后,通过与之前相同的通用算法对该缓冲区进行拼接。结果将是一个指向固定长度字符指针数组的指针表,然后可以在当前(重复地)打开、解析、使用和丢弃所有内容的相同位置迭代使用。
// declare an array of six string pointers
typedef char *OFAttribs[6];
// loads a table consisting of the following format:
//
// str1,str2,str3,str4,str5,str6\n
// str1,str2,str3,str4,str5,str6\n
// ...
// str1,str2,str3,str4,str5,str6
//
// any deviation from the above will cause premature termination of the loop
// but will return whatever was able to be parsed up to the point of failure.
// the caller should therefore always `free()` the resulting table and data
// pointers.
size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData)
{
if (!fname || !*fname)
return 0;
// check file open for thumbs up
FILE *fp = fopen(fname, "rb");
if (!fp)
return 0;
// allocate enough memory to hold the entire file, plus a terminator
fseek(fp, 0, SEEK_END);
long len = ftell(fp);
fseek(fp, 0, SEEK_SET);
// allocate enough ram for the entire file plus terminator
OFAttribs *pTable = NULL;
size_t nTableLen = 0;
char *pTableData = malloc((len+1) * sizeof(char));
if (NULL != pTableData)
{
fread(pTableData , len, 1, fp);
pTableData[len] = 0;
}
// no longer need the file
fclose(fp);
// prime first token
char *token = strtok(pTableData, ",");
while (token)
{
// read next line of tokens
OFAttribs attribs = { NULL };
for (int i=0;i<4 && token; ++i)
{
attribs[i] = token;
token = strtok(NULL, ",");
}
// filled 0..3, set lat token and move on
if (attribs[3] && token)
{
// next-to-last entry set
attribs[4] = token;
// line enter is only terminated by newline
token = strtok(NULL, "\n");
if (token)
{
// proper format. 6 parms, 5 commas, one new-line.
attribs[5] = token;
size_t slen = strlen(token);
if (slen > 0)
{
while (isspace(token[--slen]))
token[slen] = 0;
}
// make space on the master list for another.
OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * (nTableLen+1));
if (NULL != tmp)
{
pTable = tmp;
memcpy(pTable + nTableLen++, attribs, sizeof(attribs));
}
else
{ // allocation failure.
printf("Error allocating memory for expanding OKFraud data set");
exit(EXIT_FAILURE);
}
}
else
{ // not good.
printf("Invalid line format detected. Expected ok/fraud\\n");
break;
}
// next token of new line
token = strtok(NULL, ",");
}
}
// set output variables
*ppTable = pTable;
*ppTableData = pTableData;
return nTableLen;
}将其组合在一起
合并以上所有内容会对您的代码库产生以下影响:
// load the ok_fraud table ONCE.
OFAttribs *okfr = NULL;
char *okfr_data = NULL;
size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);
// walk table to determine probabilities of ok and fraud states.
// note: this really should be done as part of the loader.
for (size_t i=0;i<okfr_len; ++i)
{
if (0 == strcmp("ok", okfr[i][5]))
++counter_ok;
else
++counter_fraud;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
// compute probabilites for ok and fraud states
prblty_ok = (float)counter_ok/(float)(okfr_len);
prblty_fraud = (float)counter_fraud/(float)(okfr_len);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","w");
fp3=fopen("sales_unknown_fraud_classified.txt","w");
while(fgets(line1,sizeof(line1),fp)!=NULL) //Reading each line from file to calculate the file size.
{
char *unknwn_attr1 = strtok(line1,",");
char *unknwn_attr2 = strtok(NULL,",");
char *unknwn_attr3 = strtok(NULL,",");
char *unknwn_attr4 = strtok(NULL,",");
char *unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
for (size_t i=0;i<okfr_len; ++i)
{
if( strcmp(okfr[i][5], "ok") == 0 )
{
// ok case
if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
counter_ok_attr5++;
}
else // fraud case
{
if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
counter_fraud_attr5++;
}
}
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if (counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
// free the table data and dynamic pointer array
free(okfr);
free(okfr_data);
fclose(fp);
fclose(fp2);
fclose(fp3);
return 0;这些只是一些想法。当然还有更多的东西,但这些东西应该会对处理文件的单次向前扫描和连续输出有很大的帮助,在这种情况下,这是你将获得的最高效率。毫无疑问,sales_ok_fraud.txt文件的三大特性:单文件open+close、逻辑缩减和单解析缓存的组合将极大地提高sales_ok_fraud.txt文件的性能,尤其是其中的第一个和最后一个。
EDIT帮助OP更新此处理器以预先加载sales_ok_fraud.txt文件内容,从而消除重复加载、解析,并迅速抛出要重复解析的某些15000+文本行(每个主源代码输入行一次)。以上答案已相应更新。
发布于 2012-11-18 09:09:11
@m02ph3u5是对的。让你的文件保持打开状态,把对fopen和fclose的调用排除在循环之外。
inputFile = fopen("sales_unknwn.txt","r");
okayFile = fopen("sales_ok_fraud.txt","r");
unknownOkayFile = fopen("sales_unknown_ok_classified.txt","a");
unknownFraudFile = fopen("sales_unknown_fraud_classified.txt","a");
// your loops go here
fclose(inputFile);
fclose(okayFile);
fclose(unknownOkayFile);
fclose(unknownFraudFile);如果它仍然很慢,那么在你的应用上运行一个采样分析器,将测试数据的子集作为输入,以保持快速的周转。这会告诉你程序把时间花在哪里了。你可能会感到惊讶。如果您不知道要使用分析器,可以通过使用调试器运行您的应用程序,反复进入调试器并注意它在哪个函数中运行,来做一个穷人的采样分析器模拟。
发布于 2012-11-18 09:24:55
以下是一些建议:
·重复打开要追加的文件,然后关闭它们,然后重新打开它们,这是非常昂贵的。这是因为I/O比内存访问慢得多,而且每次写入时都会强制磁盘打开每个文件并查找到最后。最好在开始时打开它们一次,在结束时关闭它们,除非你担心程序会崩溃,你会丢失到目前为止已经写好的数据。
·您可以替换这些行
memcpy(ok_fraud_attr6_str, &ok_fraud_attr6[0], strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';使用
ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';然后在测试中使用ok_fraud_attr6。由于strtok是破坏性的(快速搜索值得您花时间了解为什么使用它通常是一个坏主意),您不必担心保留line或ok_fraud_attr6的内容。
·当您发现自己一遍又一遍地编写相同的代码时,通常会发现您的算法效率低下。而不是
if ((some_unique_test) && (a_common_test))
do_some_stuff;
if ((some_other_unique_test) && (a_common_test))
do_other_stuff;你可以写
if (a_common_test) {
if (some_unique_test)
do_some_stuff;
if (some_other_unique_test)
do_other_stuff;
}然而,请注意,只有第一个建议可能会对程序的执行时间产生明显的影响,尽管它们都是值得学习的好习惯。
Jason关于使用分析器的建议是非常好的建议,无论怎么强调都不过分。程序员--即使是有经验的程序员--在预测代码中的瓶颈在哪里是出了名的糟糕。除了调试器之外,概要文件是您最好的朋友。
https://stackoverflow.com/questions/13436307
复制相似问题