文章/答案/技术大牛

发布

社区首页 >问答首页 >这是编写词法分析器（lexer）的好方法吗？

问这是编写词法分析器（lexer）的好方法吗？
EN

Code Review用户

提问于 2019-12-22 18:20:33

回答 1查看 228关注 0票数 4

我不知道这里是否适合问这个问题。

我是C语言的半初学者，一直想创建自己的编程语言。在这里，我完全靠自己写了一个词法分析器（lexer）。lexer将源代码分解为由字符串、字符、标识符、常量和特殊符号组成的标记（token）。它还会忽略单行和多行注释。它可以对自己的源代码进行词法分析。

lexer.c

#include<stdio.h>
#include<stdlib.h>
#include<string.h>

/*
 * Character-class predicates used by the lexer. All of them assume ASCII.
 * They are macros, so the argument is evaluated more than once: never pass
 * an expression with side effects (e.g. SYMBOL(++c)).
 * Every argument use and every expansion is fully parenthesized so the
 * macros compose safely with !, && and || at the call site.
 */
/* ASCII punctuation: '!'..'/' (33-47), ':'..'@' (58-64), '['..'`' (91-96), '{'..'~' (123-126). */
#define SYMBOL(term) ((((term)>=123)&&((term)<=126))||(((term)>=33)&&((term)<=47))||(((term)>=58)&&((term)<=64))||(((term)>=91)&&((term)<=96)))
/* Upper-case ASCII letter. */
#define CAPALPHA(term) (((term)>='A')&&((term)<='Z'))
/* Lower-case ASCII letter. */
#define LOWALPHA(term) (((term)>='a')&&((term)<='z'))
/* Any ASCII letter. */
#define ALPHA(term) ((CAPALPHA(term))||(LOWALPHA(term)))
/* Control characters and space (0-32, which already includes '\n') plus DEL (127). */
#define WHITESPACE(term) ((((term)>=0)&&((term)<=32))||((term)==127))
/* Decimal digit. */
#define NUMBER(term) (((term)>='0')&&((term)<='9'))
/*
 * Hexadecimal letter digit: 'A'..'F' or 'a'..'f'. Decimal digits are NOT
 * included; callers combine this with NUMBER(). The previous definition was
 * true for every character because it tested (term>='A' || term<='f').
 */
#define HEXNUMBER(term) ((((term)>='A')&&((term)<='F'))||(((term)>='a')&&((term)<='f')))

/*
 * Number of keywords2 entries the keyword pass in main compares against.
 * NOTE(review): this is 0 although keywords2 holds two names, so the keyword
 * pass never runs. Raise it only once that pass compares with
 * strcmp(...) == 0; with a bare strcmp(...) test every identifier would be
 * reported as a keyword.
 */
#define KEYWORD_COUNT 0

/* Token categories. Each value is also the index of its name in type2. */
enum type {
    identifier,
    string,
    spsymbol,
    keyword,
    character,
    number,
    hexnumber
};

/* Printable name of each enum type value, in enum order. */
char type2[25][25]={
    "identifier\0",
    "string\0",
    "symbol\0",
    "keyword\0",
    "character\0",
    "number\0",
    "hexnumber\0"
};

/* NOTE(review): presumably the indices of the matching names in keywords2 - confirm. */
enum keywords {
    regx,
    regy
};

/* Spellings of the language keywords. */
char keywords2[25][25]={
    "regx\0",
    "regy\0"
};

/* One lexical token produced by the lexer. */
struct Tokens {
    char *t;      /* token text, NUL-terminated */
    int tlen;     /* token length */
    int ttype;    /* token category; used as an index into type2 */
    int lineno;   /* source line number recorded for the token */
    int keyword;  /* NOTE(review): presumably an index into keywords2 for keyword tokens - confirm */
};

/* Table of every token produced, in source order; filled by main. */
struct Tokens *token[100000];
int main(){
    char *input=malloc(100000*sizeof(char));
    FILE *fp = fopen("file", "r");
    FILE *of = fopen("lexout","w");
    char symbol;
    if(fp != NULL){
        int j=0;
        while(1)
        {   symbol = fgetc(fp);
            //printf("%c",symbol);
            if (symbol != EOF)
                input[j++]=symbol;
            else {
                input[j++]=symbol;
                break;
            }
        }
        fclose(fp);
    }

    int c=-1;   //file current character
    int current_token=0; //current_token counter
    int line=1; //current line number

    int halt=0;
    while(1){
        if(halt) break;
        token[current_token]=(struct Tokens*)malloc(sizeof(struct Tokens)); //allocate memory for token structure
        token[current_token]->t=(char*)malloc(sizeof(char)*30); //allocate memory for token size

        int tokenTypeSet=0;
        int in=0;   //structure token counter

        while(1){
            c++;
            if(input[c]=='\n') line++;
            /*detect end of file*/
            if(input[c]=='%'&&input[c+1]=='E'&&input[c+2]=='O'&&input[c+3]=='F'&&input[c+4]=='%'){
                halt=1;
                break;
            }
            /*identify singleline comments*/
            if(input[c]=='/'&&input[c+1]=='/'){
                while(input[c+1]!='\n'){
                    c++;
                }
                continue;
            }
            /*identify multiline comments*/
            if(input[c]=='/'&&input[c+1]=='*'){
                c++;
                c++;
                while(input[c]!='*'&&input[c+1]!='/'){
                    if(input[c]=='\n'){
                        line++;
                    }
                    c++;
                }
                c++;
                continue;
            }
            /*identify string*/
            if(input[c]=='"'){
                //identify token type
                if(!tokenTypeSet){
                    token[current_token]->ttype=string;
                    tokenTypeSet=1;
                }
                c++;
                while(1){
                    //newline
                    if(input[c]=='\n'){
                        c++;
                        line++;
                    }
                    //backslash escaped
                    if(input[c]=='\\'&&input[c+1]=='\\'){
                        token[current_token]->t[in++]=input[c];
                        c++;
                        token[current_token]->t[in++]=input[c];
                        c++;
                        continue;
                    }
                    //double-quote escaped
                    if(input[c]=='\\'&&input[c+1]=='"'){
                        token[current_token]->t[in++]=input[c];
                        c++;
                        token[current_token]->t[in++]=input[c];
                        c++;
                        continue;
                    }
                    //contiguous double-quotes
                    if(input[c]=='"'&&input[c+1]=='"'){
                        token[current_token]->t[in++]=input[c];
                        c++;
                        token[current_token]->t[in++]=input[c];
                        c++;
                        continue;
                    }
                    //terminate string
                    if(input[c]=='"'){
                        //c++;
                        break;
                    }
                    token[current_token]->t[in++]=input[c];
                    c++;
                }
                token[current_token]->t[in]='\0';
                break;
            }
            /*identify characters*/
            if(input[c]=='\''){
                //identify token type
                if(!tokenTypeSet){
                    token[current_token]->ttype=character;
                    tokenTypeSet=1;
                }
                c++;
                while(1){
                    //backslash escaped
                    if(input[c]=='\\'&&input[c+1]=='\\'){
                        token[current_token]->t[in++]=input[c];
                        c++;
                        token[current_token]->t[in++]=input[c];
                        c++;
                        continue;
                    }
                    //sigle-quote escaped
                    if(input[c]=='\\'&&input[c+1]=='\''){
                        token[current_token]->t[in++]=input[c];
                        c++;
                        token[current_token]->t[in++]=input[c];
                        c++;
                        continue;
                    }
                    //contiguous single-quotes
                    if(input[c]=='\''&&input[c+1]=='\''){
                        token[current_token]->t[in++]=input[c];
                        c++;
                        token[current_token]->t[in++]=input[c];
                        c++;
                        continue;
                    }
                    //terminate character
                    if(input[c]=='\''){
                        //c++;
                        break;
                    } 
                    token[current_token]->t[in++]=input[c];
                    c++;
                }
                token[current_token]->t[in]='\0';
                break;
            }
            /*mark hexnumbers*/
            if(!tokenTypeSet){
                if(input[c]=='0'&&(input[c+1]=='x'||input[c+1]=='X')){
                    token[current_token]->t[in++]=input[c];
                    c++;
                    token[current_token]->t[in++]=input[c];
                    c++;
                    token[current_token]->ttype=hexnumber;
                    tokenTypeSet=1;
                }
                //continue;
            }
            /*read hex numbers*/
            if(token[current_token]->ttype==hexnumber){
                if(HEXNUMBER(input[c])||NUMBER(input[c])){
                    token[current_token]->t[in++]=input[c];
                }
                if(WHITESPACE(input[c+1])){
                    token[current_token]->t[in]='\0';
                    //c++; //undo
                    break;
                }
                if(SYMBOL(input[c+1])){
                    token[current_token]->t[in]='\0';
                    break;
                }
                if(!(HEXNUMBER(input[c+1])||NUMBER(input[c+1]))){
                    token[current_token]->t[in]='\0';
                    break;
                }
                continue;
            }
            /*read an alphabet or an underscore*/
            if(ALPHA(input[c])||input[c]=='_'){
                //identify token type
                if(!tokenTypeSet){
                    token[current_token]->ttype=identifier;
                    tokenTypeSet=1;
                }
                token[current_token]->t[in++]=input[c];
                //detect end of token
                if(WHITESPACE(input[c+1])){
                    token[current_token]->t[in]='\0';
                    //c++; //undo
                    break;
                }
                if((SYMBOL(input[c+1]))&&(input[c+1]!='_')){
                    token[current_token]->t[in]='\0';
                    break;
                }
                continue;
            }
            /*read a symbol*/
            if(SYMBOL(input[c])){
                if(!tokenTypeSet){
                    token[current_token]->ttype=spsymbol;
                    tokenTypeSet=1;
                }
                token[current_token]->t[in++]=input[c];
                //detect end of token
                if(WHITESPACE(input[c+1])){
                    token[current_token]->t[in]='\0';
                    //c++; //undo
                    break;
                }
                if(SYMBOL(input[c+1])||ALPHA(input[c+1])||NUMBER(input[c+1])){
                    token[current_token]->t[in]='\0';
                    break;
                }
                continue;
            }
            /*read a number*/
            if(NUMBER(input[c])){
                //identify token type
                if(!tokenTypeSet){
                    token[current_token]->ttype=number;
                    tokenTypeSet=1;
                }
                token[current_token]->t[in++]=input[c];
                //detect end of token
                if(WHITESPACE(input[c+1])){
                    token[current_token]->t[in]='\0';
                    //c++; //undo
                    break;
                }
                if(!(NUMBER(input[c+1]))){
                    if(token[current_token]->ttype==identifier&&(ALPHA(input[c+1])||input[c+1]=='_')){
                        continue;
                    }
                    token[current_token]->t[in]='\0';
                    break;
                }
                /*if(SYMBOL(input[c+1])||ALPHA(input[c+1])){
                    token[current_token]->t[in]='\0';
                    break;
                }*/
                continue;
            }
        }
        token[current_token]->lineno=line;
        current_token++;
    }

    //DEBUGGER
    //printf("Total nos of tokens = %d\n\n",current_token-1);
    fprintf(of,"Total nos of tokens = %d\n\n",current_token-1);
    for(int i=0;i<current_token-1;i++){
        //printf("%s:%s\n",type2[token[i]->ttype],token[i]->t);
        if(token[i]->ttype==identifier){
            for(int j=0;j<KEYWORD_COUNT;j++){
                if(strcmp(token[i]->t,keywords2[j])){
                    token[i]->ttype=keyword;
                    break;
                }
            }
        }
        fprintf(of,"lnos:%d,%s:%s\n",token[i]->lineno,type2[token[i]->ttype],token[i]->t);
    }
    printf("%s","***********************\n");
    //END OF LEXING
    return 0;
}

要测试lexer，您需要创建一个名为"file"的源文件。"file"中应该是源代码，后面跟着%EOF%字符串。例如，文件内容如下：

#include<stdio.h>

int main(){
int a = 5;
int b = a;
return 0;
}

%EOF%

编译程序：

gcc lexer.c -o lexer.out -Wall

输出：

Total nos of tokens = 26

lnos:1,symbol:#
lnos:1,identifier:include
lnos:1,symbol:<
lnos:1,identifier:stdio
lnos:1,symbol:.
lnos:1,identifier:h
lnos:1,symbol:>
lnos:3,identifier:int
lnos:3,identifier:main
lnos:3,symbol:(
lnos:3,symbol:)
lnos:3,symbol:{
lnos:4,identifier:int
lnos:4,identifier:a
lnos:4,symbol:=
lnos:4,number:5
lnos:4,symbol:;
lnos:5,identifier:int
lnos:5,identifier:b
lnos:5,symbol:=
lnos:5,identifier:a
lnos:5,symbol:;
lnos:6,identifier:return
lnos:6,number:0
lnos:6,symbol:;
lnos:7,symbol:}

其中，lnos是标记（token）所在的行号。源代码有很好的注释。如果有什么不清楚的地方，请提问。

稍后，我将在解析阶段使用这些标记。

我所寻求的只是一点指导。

我需要知道

这是否是编写词法分析器的有效方法。
我的代码中是否存在不好的编码实践。
代码是否还能改进。

compiler

lexical-analysis

回答 1

Code Review用户

回答已采纳

发布于 2019-12-22 21:27:35

以下是一些可以帮助您改进程序的事情。

不要将`define`用于类似函数的宏

使用类似函数的宏几乎没有好处，却有相当大的缺点。它们缺乏类型检查，往往会导致错误。例如，如果我们这样使用：

c = '0';
SYMBOL(++c);
printf("%c\n", c);

我们会看到它打印的是"5"，因为与真正的函数不同，参数在宏展开后每出现一次并被求值，c就会自增一次。

使用更多的空格来实现可读性

像这样的台词：

if(input[c]=='%'&&input[c+1]=='E'&&input[c+2]=='O'&&input[c+3]=='F'&&input[c+4]=='%'){

因为缺少空白，所以很难读懂。对于大多数人来说，像这样写的时候读起来容易得多：

if (input[c] == '%' && input[c + 1] == 'E' && input[c + 2] == 'O'
    && input[c + 3] == 'F' && input[c + 4] == '%') {

消除了“幻数”

代码中的宏之一是：

#define SYMBOL(term) (term>=123&&term<=126)||(term>=33&&term<=47)||(term>=58&&term<=64)||(term>=91&&term<=96)

然而，很难弄清楚这意味着什么，因为所有未命名的数值常数。最好是像您对LOWALPHA宏所做的那样，直接使用字符值。或者更好的是，按照下一个建议使用ispunct()。

使用标准功能和设施

有几个宏试图复制已经存在的函数。具体来说，isupper()和许多相关的函数都在<ctype.h>中。

不要泄漏内存

这段代码在几个地方调用了malloc，但从未调用free。这意味着代码正在泄漏内存。更好的做法是养成每次调用malloc都配套调用一次free的习惯，从而确保不会泄漏内存。

检查`malloc`的返回值

如果程序内存不足，对malloc的调用可能失败。这方面的唯一指示是调用将返回一个NULL指针。您应该检查这一点，避免取消引用NULL指针(这通常会导致程序崩溃)。如果程序不能在没有内存的情况下继续运行，请释放任何分配的内存并优雅地退出该程序。

将程序分解为函数

这里的所有逻辑都是在一个相当长且密集的代码块中的main中。最好将其分解为不同的函数。

在适当的情况下使用`bool`

halt标志被用作布尔变量。如果您使用#include <stdbool.h>，就可以使用bool类型，从而更清楚地表明该变量的用途。

简化了代码

该守则目前包含以下内容：

if (fp != NULL) {
    int j = 0;
    while (1) {
        symbol = fgetc(fp);
        //printf("%c",symbol);
        if (symbol != EOF)
            input[j++] = symbol;
        else {
            input[j++] = symbol;
            break;
        }
    }
    fclose(fp);
}

这比实际需要的更复杂，也更脆弱。说它脆弱，是因为没有任何机制阻止它写到已分配的input缓冲区之外。它也因为用了多个while、if和else而变得更复杂。它可以像这样写：

if (fp == NULL) {
    puts("Cannot open input file");
    exit(1);
}
fgets(input, BUFFLEN, fp);
fclose(fp);

还请注意，我使用的是BUFFLEN，而不是上面提到的“魔术数字”。（注意：fgets每次调用最多只读取一行；若要读入整个文件，需要循环调用，或改用fread。）同样，这段代码比它所需要的更复杂：

int halt = 0;
while (1) {
    if (halt)
        break;
    // more code
}

把它写成这样：

bool halt = false;
while (!halt) {

不要使用未初始化的内存

代码为token分配内存，但在实际初始化它之前测试它的某些字段的内容。如果需要将其设置为特定值，则应将其设置。如果零足够，请使用calloc。

不要用无用的东西来干扰代码

在许多情况下，我们有这样的代码：

token[current_token]->t = (char *)malloc(sizeof(char) * 30);

但是，不需要转换值，sizeof(char)被定义为始终为1。

token[current_token]->t = malloc(MAX_TOKEN_LEN);

同样，我们正在避免前面提到的“神奇数字”。

允许用户指定输入和输出文件

文件名目前是硬编码的，这无疑极大地限制了程序的有用性。考虑使用argc和argv来允许用户在命令行上指定文件名。另外，对于硬编码的文件名来说，file无疑是一个糟糕的选择，因为它没有告诉人们任何关于文件应该包含什么内容的有用信息。

使用状态机

您可能会发现最好使用状态机来进行此解析。有关如何执行此操作的示例，请参阅这个答案。

票数 5

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/234500

复制

相似问题

问这是编写词法分析器（lexer）的好方法吗？
EN

回答 1

Code Review用户

不要将`define`用于类似函数的宏

使用更多的空格来实现可读性

消除了“幻数”

使用标准功能和设施

不要泄漏内存

检查`malloc`

将程序分解为函数

在适当的情况下使用`bool`

简化了代码

不要使用未初始化的内存

不要用无用的东西来干扰代码

允许用户指定输入和输出文件

使用状态机

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问这是编写词法分析器（lexer）的好方法吗？EN

回答 1

Code Review用户

不要将define用于类似函数的宏

使用更多的空格来实现可读性

消除了“幻数”

使用标准功能和设施

不要泄漏内存

检查malloc

将程序分解为函数

在适当的情况下使用bool

简化了代码

不要使用未初始化的内存

不要用无用的东西来干扰代码

允许用户指定输入和输出文件

使用状态机

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问这是编写词法分析器（lexer）的好方法吗？
EN

不要将`define`用于类似函数的宏

检查`malloc`

在适当的情况下使用`bool`