我正在设计我自己的编程语言,除了我看的代码之外,我完全没有计算机科学方面的经验。
我创建了我的第一个lexer,我已经对它进行了足够的改进,以便lexer实际上可以lex它自己的源代码。
我在这里要问的是,我怎样才能改进这个lexer?
#ifndef __LEX_H__
#define __LEX_H__
#define is_alphabetic(c) (\
((c) >= 'a' && (c) <= 'z')\
|| ((c) >= 'A' && (c) <= 'Z')\
|| (c) == '_'\
|| ((c) < 0)) // lazy unicode support
#define is_potential_identifier(c) (\
((c) >= 'a' && (c) <= 'z')\
|| ((c) >= 'A' && (c) <= 'Z')\
|| ((c) >= '0' && (c) <= '9')\
|| (c) == '_'\
|| ((c) < 0))
#define is_space(c) ( (c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n' || (c) == '\v' || (c) == '\f' )
#define is_numeric(c) ( (c) >= '0' && (c) <= '9' )
#define is_hex(c) ( ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F') )
#define is_numeral(c) (is_numeric((c)) || is_hex((c)))
/* tokens enum */
enum n_token {
Invalid = 0,
Equal, // =
EqualCmp, // ==
Semicolon, // ;
Plus, // +
PlusEqual, // +=
Increment, // ++
Dash, // -
MinusEqual, // -=
Decrement, // --
Asterisk, // *
MultEqual, // *=
DivSlash, // /
DivEqual, // /=
LeftParens, // (
RiteParens, // )
NumIdent, // vsnjdfn
NumConstant, // 548348
NumConstantHex, // 0x4E24FDA
LeftSqBracket, // [
RightSqBracket, // ]
LeftCurlBrace, // {
RightCurlBrace, // }
Dot, // .
Colon, // :
Comma, // ,
LeftArrow, // <
LeftBitShift, // <<
LeftBitShiftEqual, // <<=
LessEqual, // <=
RightArrow, // >
RightBitShift, // >>
RightBitShiftEqual, // >>=
GreaterEqual, // >=
NumConstantReal, // 453.54354
QuestionMark, // ?
HashSym, // #
Ampersand, // &
AndEqual, // &=
BoolAnd, // &&
Carot, // ^
XorEqual, // ^=
Percent, // %
ModuloEqual, // %=
ExclamationMark, // !
NotEqual, // !=
VerticalBar, // |
OrEqual, // |=
BoolOr, // ||
Tilde, // ~
StringConstant, // "352dfsgnj34"
CharConstant, // 's'
LeftSlash, // '\\'
Keyword, // struct
AtSign, // @
Ellipses, // ...
Arrow, // ->
NumConstantHexFloat, // 0x0.3p10
DollarSign, // $
};
struct s_token {
char word[512];
enum n_token toktype;
};
struct lexer {
unsigned int count, size;
struct s_token *array;
};
void tokenize_string(char*, struct lexer*);
void print_tokens_colored(struct lexer*);
#endiflex.c
#include "lex.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static int is_array_full(struct lexer *vec)
{
return (vec->count >= vec->size);
}
static void resize_array(struct lexer *vec)
{
vec->size <<= 1;
vec->array = realloc(vec->array, sizeof(struct s_token) * vec->size);
}
static void add_to_array(struct lexer *vec, enum n_token tok, char *codestring, unsigned int size)
{
if (is_array_full(vec))
resize_array(vec);
vec->array[vec->count].toktype = tok;
strncpy( vec->array[vec->count].word, codestring, size );
vec->count++;
}
/*
lol here some code
*/
#define reset_string(str) memset(wording, '\0', sizeof((str)));\
i=0;
#define PrintIter printf("*iter => %c\n", *iter);
#define KEYWORDS 34
void tokenize_string(char *code, struct lexer *vec)
{
char *iter = code;
char wording[512] = "";
unsigned int i = 0;
const char *keywords[KEYWORDS] = {
"auto", "const", "double", "float", "int", "short", "struct", "unsigned",
"break", "continue", "else", "for", "long", "signed", "switch", "void",
"case", "default", "enum", "goto", "register", "sizeof", "typedef", "volatile",
"char", "do", "extern", "if", "return", "static", "union", "while", "inline", "alignof"
};
while ( *iter != '\0' ) {
while ( is_space(*iter) && *iter != '\0' )
++iter;
if (*iter == '/' && iter[1] == '*') { // found C style /**/ comment
do {
++iter;
}
while ( !(*iter == '*' && iter[1] == '/') );
iter += 2;
}
if (*iter == '/' && iter[1] == '/') { // found C++ style // comment
while ( *iter != '\n' )
++iter;
}
if (*iter == '\\' && iter[1] == '\n') { // formatting Left slash check
add_to_array(vec, LeftSlash, "\\", 2);
iter += 2;
}
if (*iter == '\"') { // found string literal, adjust for "\\" so we won't crash
wording[i++] = *iter++;
while ( *iter != '\"' ) {
if (*iter == '\\' && iter[1] == '\"' && iter[-1] != '\\') {
wording[i++] = *iter++;
}
wording[i++] = *iter++;
}
wording[i++] = *iter++;
}
if (wording[0] != '\0') {
//printf("wording => %s\n\n", wording);
add_to_array(vec, StringConstant, wording, i+1);
reset_string(wording);
}
if ( *iter == '\'' ) { // found character literal, adjust for '\\' so we won't crash
wording[i++] = *iter++;
int counter=0;
while (*iter != '\'' && counter < 2) {
if (*iter == '\\' && iter[1] == '\'' && iter[-1] != '\\') {
wording[i++] = *iter++;
}
wording[i++] = *iter++;
++counter;
}
wording[i++] = *iter++;
}
if (wording[0] != '\0') {
add_to_array(vec, CharConstant, wording, i+1);
reset_string(wording);
}
if (*iter == '0' && (iter[1] == 'x' || iter[1] == 'X')) { // found hexadecimal constant
wording[i++] = *iter++; // copy both 0 and x
wording[i++] = *iter++;
while ( is_numeral(*iter) ) {
wording[i++] = *iter++;
}
if ( *iter == '.' && is_numeral(iter[1]) ) { // found hexadecimal float
wording[i++] = *iter++;
while ( is_numeral(*iter) )
wording[i++] = *iter++;
if (*iter == 'p' && is_numeral(iter[1])) { // stuff like 0x0.3p10.
wording[i++] = *iter++;
while ( is_numeral(*iter) )
wording[i++] = *iter++;
}
if (wording[0] != '\0') {
add_to_array(vec, NumConstantHexFloat, wording, i+1);
reset_string(wording);
}
}
else {
if (wording[0] != '\0') {
add_to_array(vec, NumConstantHex, wording, i+1);
reset_string(wording);
}
}
}
while ( is_numeric(*iter) ) { // found decimal constant
wording[i++] = *iter++;
}
if ( *iter == '.' && is_numeric(iter[1]) ) { // found floating point number
wording[i++] = *iter++;
while ( is_numeric(*iter) )
wording[i++] = *iter++;
if ( (*iter == 'p' || *iter == 'P' || *iter == 'e' || *iter == 'E') && is_numeric(iter[1]) )
{
wording[i++] = *iter++;
while ( is_numeric(*iter) )
wording[i++] = *iter++;
}
if (*iter == 'f') // stuff like 2.0f etc.
wording[i++] = *iter++;
if (wording[0] != '\0') {
add_to_array(vec, NumConstantReal, wording, i+1);
reset_string(wording);
}
}
else {
if (wording[0] != '\0') {
add_to_array(vec, NumConstant, wording, i+1);
reset_string(wording);
}
}
if (is_alphabetic(*iter)) { // found an identifier or potential keyword
while (is_potential_identifier(*iter))
wording[i++] = *iter++;
}
if (wording[0] != '\0') {
int x;
int found_keyword = 0;
for ( x=0 ; x<KEYWORDS ; ++x ) {
if ( !strcmp(wording, keywords[x]) ) {
found_keyword = 1;
}
}
if (found_keyword)
add_to_array(vec, Keyword, wording, i+1);
else add_to_array(vec, NumIdent, wording, i+1);
reset_string(wording);
}
switch ( *iter ) {
case '=':
if (iter[1] == '=') {
++iter;
add_to_array(vec, EqualCmp, "==", 3);
}
else add_to_array(vec, Equal, "=", 2);
break;
case ';':
add_to_array(vec, Semicolon, ";", 2);
break;
case ':':
add_to_array(vec, Colon, ":", 2);
break;
case '+': // possible uses => left unary is positive, twice unary is increment, once binary is addition
if (iter[1] == '=') {
++iter;
add_to_array(vec, PlusEqual, "+=", 3);
}
else if (iter[1] == '+') {
++iter;
add_to_array(vec, Increment, "++", 3);
}
else add_to_array(vec, Plus, "+", 2);
break;
case '-': // possible uses => left unary is negating, twice unary is decrement, one binary is minus
if (iter[1] == '=') {
++iter;
add_to_array(vec, MinusEqual, "-=", 3);
}
else if (iter[1] == '-') {
++iter;
add_to_array(vec, Decrement, "--", 3);
}
else if (iter[1] == '>') {
++iter;
add_to_array(vec, Arrow, "->", 3);
}
else add_to_array(vec, Dash, "-", 2);
break;
case '*': // leftward unary is dereferencing ptr, binary be mult. Also check for / as ending comment
if (iter[1] == '=') {
++iter;
add_to_array(vec, MultEqual, "*=", 3);
}
else add_to_array(vec, Asterisk, "*", 2);
break;
case '/': // check for * and / as comment EDIT: DONE
if (iter[1] == '=') {
++iter;
add_to_array(vec, DivEqual, "/=", 3);
}
else add_to_array(vec, DivSlash, "/", 2);
break;
case '(':
add_to_array(vec, LeftParens, "(", 2);
break;
case ')':
add_to_array(vec, RiteParens, ")", 2);
break;
case '[':
add_to_array(vec, LeftSqBracket, "[", 2);
break;
case ']':
add_to_array(vec, RightSqBracket, "]", 2);
break;
case '{':
add_to_array(vec, LeftCurlBrace, "{", 2);
break;
case '}':
add_to_array(vec, RightCurlBrace, "}", 2);
break;
case '.':
if (iter[1] == '.' && iter[2] == '.') {
iter += 2;
add_to_array(vec, Ellipses, "...", 4);
}
else add_to_array(vec, Dot, ".", 2);
break;
case ',':
add_to_array(vec, Comma, ",", 2);
break;
case '<':
if (iter[1] == '<') {
if (iter[2] == '=') {
add_to_array(vec, LeftBitShiftEqual, "<<=", 4);
iter += 2;
}
else {
add_to_array(vec, LeftBitShift, "<<", 3);
++iter;
}
}
else if (iter[1] == '=') {
add_to_array(vec, LessEqual, "<=", 3);
++iter;
}
else add_to_array(vec, LeftArrow, "<", 2);
break;
case '>':
if (iter[1] == '>') {
if (iter[2] == '=') {
add_to_array(vec, RightBitShiftEqual, ">>=", 4);
iter += 2;
}
else {
add_to_array(vec, RightBitShift, ">>", 3);
++iter;
}
}
else if (iter[1] == '=') {
add_to_array(vec, GreaterEqual, ">=", 3);
++iter;
}
else add_to_array(vec, RightArrow, ">", 2);
break;
case '?':
add_to_array(vec, QuestionMark, "?", 2);
break;
case '#':
add_to_array(vec, HashSym, "#", 2);
break;
case '&':
if (iter[1] == '=') {
++iter;
add_to_array(vec, AndEqual, "&=", 3);
}
else if (iter[1] == '&') {
++iter;
add_to_array(vec, BoolAnd, "&&", 3);
}
else add_to_array(vec, Ampersand, "&", 2);
break;
case '^':
if (iter[1] == '=') {
++iter;
add_to_array(vec, XorEqual, "^=", 3);
}
else add_to_array(vec, Carot, "^", 2);
break;
case '%':
if (iter[1] == '=') {
++iter;
add_to_array(vec, ModuloEqual, "%=", 3);
}
else add_to_array(vec, Percent, "%", 2);
break;
case '!':
if (iter[1] == '=') {
++iter;
add_to_array(vec, NotEqual, "!=", 3);
}
else add_to_array(vec, ExclamationMark, "!", 2);
break;
case '|':
if (iter[1] == '=') {
++iter;
add_to_array(vec, OrEqual, "|=", 3);
}
else if (iter[1] == '|') {
++iter;
add_to_array(vec, BoolOr, "||", 3);
}
else add_to_array(vec, VerticalBar, "|", 2);
break;
case '~':
add_to_array(vec, Tilde, "~", 2);
break;
case '@':
add_to_array(vec, AtSign, "@", 2);
break;
}
++iter;
}
}
void print_tokens_colored(struct lexer *vec)
{
#define KNRM "\x1B[0m" // Normal
#define KRED "\x1B[31m"
#define KGRN "\x1B[32m"
#define KYEL "\x1B[33m"
#define KBLU "\x1B[34m"
#define KMAG "\x1B[35m"
#define KCYN "\x1B[36m"
#define KWHT "\x1B[37m"
#define RESET "\033[0m" // Reset obviously
int i;
for (i=0 ; i<vec->count ; ++i) {
switch (vec->array[i].toktype) {
case NumConstantHex:
case NumConstant:
case StringConstant:
case CharConstant:
case NumConstantReal:
printf("token #%i => %s%s%s\n", i, KMAG, vec->array[i].word, RESET);
break;
case NumIdent:
printf("token #%i => %s%s%s\n", i, KCYN, vec->array[i].word, RESET);
break;
case Keyword:
printf("token #%i => %s%s%s\n", i, KRED, vec->array[i].word, RESET);
break;
default:
printf("token #%i => %s%s%s\n", i, KGRN, vec->array[i].word, RESET);
}
}
}发布于 2017-01-02 09:44:35
sizeof,而不是sizeof引用类型。对于OP代码,为了确定在lex.c中函数中是否使用了正确的强制转换,审阅者必须与另一个文件lex.h中的代码进行交叉检查。通过使用引用的对象,代码更有可能在一开始就被正确编码,更容易检查,更容易更新。//这是否是正确的类型/ size ? // vec->array = realloc( vec->array,size Is (Struct s_token) * vec->size);//肯定大小为vec->array= realloc(vec->array,size Is *(vec->array) *vec->size Is*(vec->array)* vec->size);strcpy()就足够了。如果代码与溢出有关,则添加测试。// strncpy( vec->数组vec->计数.word,码串,大小);断言(strlen(码串)< size vec vec->arrayvec->计数.word);strcpy( vec>数组vec->计数.word,码串);512 is char wording[512]当然是从char word[512];中的512派生的,使用宏来定义或派生#define LEXER_N 512。void tokenize_string(char *code,struct *vec) {. // char 512= "";char措辞大小vec>数组_>字= "";is_hex()的功能与标准C函数isxdigit()不同,后者在isxdigit('4')上返回非零(truth)。建议: //定义is_hex(c) (c) >= 'a‘&& (c) <= 'f') . #define is_hex(c) (is_numeric(c) (C) <= 'a’&(C)<= 'f') .enum n_token的范围将被评估。考虑:枚举n_token {无效= 0,等于,// = EqualCmp,// == .NumConstantHexFloat,// 0x0.3p10 DollarSign,// $ n_token_N // add };KEYWORDS或幻数34 // const *关键字关键词={ const *keywords[] ={ "auto“、"const”、"double“、"float”、"int“、"short”、"struct",其余的};// for ( x=0;xswitch ( *iter ) {有一个default:案例来捕获格式错误的文本。is_space('\0')肯定是错误的,那么简化吧。// while ( is_space( *iter ) &*iter != '\0‘) while (is_space(*iter))enum n_token的列表,字母化它-更容易调试/维护。AndEqual,// &= Arrow,// -> AtSign,// @发布于 2017-01-02 04:46:21
这是一种非常简单的方法来编写一个lexer,因此非常容易读懂。干得好!以下是我对如何加以改进的想法:
一般来说,除非你有一种非常简单的特定于领域的语言,否则最好不要写你自己的词汇。这是一个很好的练习,但在现实世界中,您应该使用像lex这样的工具。输入文件显着地更小,更容易理解,并且不太可能包含bug。但我想你这么做是为了教自己这个过程,这是个好主意!
你用了很多宏。这方面有两个问题:
一般来说,很长的函数,特别是很长的循环,很难理解。在阅读代码时,需要花费更多的精力才能将所有移动的部分保留在脑海中。我建议将tokenize_string()分解成几十个较小的函数。例如,我可能会将while循环更改为如下所示:
while ( *iter != '\0' ) {
trim_leading_whitespace(&iter);
remove_C_comments(&iter);
remove_Cpp_comments(&iter);
handle_newline(&iter, vec);
//... etc.这使得代码的意图更清晰、更容易理解。很有可能的是,这些短函数将自动内联,它将与现在一样快。
时处理问题
在add_to_array()中,首先检查数组是否已满,如果是,则调整数组的大小。然后你做一些可能会填满数组的事情。这在我看来是反向的。我会在增量vec->count之后立即放置检查并调整大小。的确,在非常罕见的情况下,您可能会分配额外的内存,在这种情况下,您分配的内存与您需要的内存完全相同,但这种情况很少见,不太可能出现问题。
有问题的是,您没有处理来自realloc()的潜在错误。如果它不能分配更多的内存,它将返回NULL,vec->array将无效。(一些消息人士说,它将继续指向原来的位置-这也是一个问题,因为你没有分配更多的内存,但vec->size说内存更大。不管怎样,这都是个问题。此外,代码中任何地方都没有对free()的一个调用。你会漏掉很多记忆的!
我注意到s_token.word成员仅限于512个字节。为什么要限制它?如果你要限制它,为什么是512呢?如果对512有一些意义,就给它一个命名的常量,就像您对KEYWORDS所做的那样。
const 说到常量,如果函数参数不被函数更改,则应该将它们标记为const。例如,在add_to_array()中,令牌被复制,字符串也被复制。这意味着原件不会更改,因此您应该将tok、codestring和size标记为const。它告诉编译器和读者,他们不需要担心函数中那些值的变化。
#defines我注意到您正在定义KNRM、KRED等,难道不应该用标准的标头来定义吗?如果不是,名字可能会更清晰一些,或者至少会有一条评论来解释它们是什么。
https://codereview.stackexchange.com/questions/151427
复制相似问题