本人的分享均来自于实际设计过程中的感悟
不能保证分享成果的正确性,如有错误,请各路大神指出,我会虚心学习,感谢!!!
有时候我很好奇,编译器是如何知道我们输入的代码是什么意思的,它又是如何把我们的代码编译成二进制可执行文件的呢?今天我们来实现编译器的第一步:一个非常简单的C语言词法分析器。
用于测试的代码文件,hello.c代码如下:
uint a=2147483649,b=321; // two integer globals; the first initializer is larger than 2^31-1
double c=111.1; // literal containing a decimal point
string str="ABC123\n"; // string literal ending in a newline escape
int main(int aa,int bb) // function header with two int parameters
{
int x=0,y=3; // two local declarations separated by a comma
a++; // doubled plus operator
a--; // doubled minus operator
if(a!=b) // two-character not-equal operator
{
a=1;
}
else
{
a=2;
}
printf("ABC %d '\" \\ 123\r\n",a); // string literal mixing quote, backslash, CR and LF escapes
}
int add(int a1,int a2) // returns the sum of its two arguments
{
return a1+a2;
}
词法分析器的代码如下,lexer.h:
#ifndef LEXER_H
#define LEXER_H
#include <QString>
#include <QObject>
#include <QList>
#include <QMap>
#include <QDebug>
#include <QMetaEnum>
// Token categories assigned by the lexer.
// The numeric values double as indexes into Token::TokenType_str,
// so their order must stay in sync with that name table.
enum TokenType
{
    // Identifiers and keywords (function names, variable names, built-in names).
    ID = 0,
    // Numeric literals; the lexer also tags single-quoted character literals as NUM.
    NUM = 1,
    // Double-quoted string literals.
    STRING = 2,
    // Operators and delimiters.
    OP = 3
};
//单词属性
class Token
{
public:
QStringList TokenType_str={"ID","NUM","STRING","OP"};
QString word;//单词内容
TokenType type;//单词类别
uint line;//单词所在行
Token()
{
}
Token(QString word,TokenType type,uint line)
{
this->word=word;
this->type=type;
this->line=line;
}
void prt()
{
QString s="%1: %2 > %3 ";
s=s.arg(line,5).arg(TokenType_str[type],10).arg(word,10);
qDebug()<<s;
}
};
// Lexer: splits source text into a flat list of Token objects.
class Lexer
{
public:
    // Creates a lexer with no source text.
    Lexer();

    // Creates a lexer that will tokenize `code` when run() is called without an argument.
    Lexer(QString code);

    // Tokenizes `code` if it is non-empty, otherwise the text stored in the lexer.
    // Returns the tokens in source order.
    QList<Token> run(QString code="");

private:
    QString codestr; // source text to tokenize
    uint line;       // current line counter used while scanning
};
#endif // LEXER_H
lexer.cpp
#include "lexer.h"
// Default constructor: starts with empty source text and the line counter at 1.
Lexer::Lexer()
    : codestr(""), line(1)
{
}
// Stores `code` as the text to tokenize and starts the line counter at 1.
Lexer::Lexer(QString code)
    : codestr(code), line(1)
{
}
QList<Token> Lexer::run(QString code)
{
QList<Token> tokens;
if(code.length()>0)
{
codestr=code;
}
if(codestr.length()>0)
{
//开始解析
QByteArray local8Bit = codestr.toLocal8Bit();
char* p=local8Bit.data();//临时指针
char tk=*p;
//遍历字符串
while((tk=*p++)!='\0')
{
//tk代表当前字符 ntk*p代表后一个字符
if(tk=='\n'){line++;}//行数统计
else if(tk=='#'){while (*p != 0 && *p != '\n') ++p;}//忽略#关键字,不支持
else if ((tk >= 'a' && tk <= 'z') || (tk >= 'A' && tk <= 'Z') || tk == '_') {//解析ID
QString str(tk);
while ((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '_')
{
str.append(*p++);
}
Token token(str,TokenType::ID,line);
tokens.append(token);
}
else if (tk >= '0' && tk <= '9') {//解析数字
QString str(tk);
while ((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f') || (*p >= 'A' && *p <= 'F')
||*p == 'x' || *p == 'X'
||*p == 'b' || *p == 'B'
||*p == '.' || *p == 'e'
)
{
str.append(*p++);
}
Token token(str,TokenType::NUM,line);
tokens.append(token);
}
else if (tk == '/') {
if (*p == '/') {//忽略注释
++p;
while (*p != 0 && *p != '\n') ++p;
}
else if(*p=='*')//忽略多行注释
{
++p;
while (*p!='\0'){
char c1=*p;
char c2=*(p+1);
if(c1=='*' && c2=='/')
{
++p;++p;
break;
}
++p;
}
}
else {
Token token("/",TokenType::OP,line);
tokens.append(token);
}
}
else if (tk == '\'' || tk == '"') {
QString str;
while (*p != 0 && *p != tk) {
if(*p=='\\')
{
char nc=*++p;//xia'yi'ge下一个字符
if(nc=='n' || nc=='r' || nc=='t' || nc =='\'' || nc=='\"' || nc=='\\')//转义字符
{
if(nc=='n')str.append('\n');
if(nc=='r')str.append('\r');
if(nc=='t')str.append('\t');
if(nc=='\'')str.append('\'');
if(nc=='\"')str.append('\"');
if(nc=='\\')str.append('\\');
++p;
}
}
else
{
str.append(*p++);
}
}
++p;
if (tk == '"'){
Token token(str,TokenType::STRING,line);
tokens.append(token);
}
else
{
Token token(str,TokenType::NUM,line);
tokens.append(token);
}
}
else if (tk == '=' || tk == '+' || tk == '-' || tk == '|' || tk == '&')
{
char tk_next = *p;//检查下个字符串是否相同
if (tk_next == tk)
{
p++;
QString str;
str.append(tk);
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
else
{
QString str;
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
}
else if (tk == '!' || tk == '>' || tk == '<')
{
char tk_next = *p;//检查下个字符串是否相同
if (tk_next == '=')
{
p++;
QString str;
str.append(tk);
str.append(tk_next);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
else
{
QString str;
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
}
else if (tk == '~' || tk == ';' || tk == '{' || tk == '}' || tk == '(' || tk == ')' || tk == ']' || tk == ',' || tk == ':')
{
QString str;
str.append(tk);
Token token(str,TokenType::OP,line);
tokens.append(token);
}
}
}
//打印单词信息
foreach (Token t, tokens) {
t.prt();
}
qDebug()<<"=================================================";
return tokens;
}
在main.cpp文件中调用词法分析器分析hello.c中的代码:
#include <QCoreApplication>
#include <QFile>
#include <lexer.h>
#include <parser.h>
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
QFile file("./hello.c");
file.open(QFile::ReadOnly);
QByteArray localReadAll = file.readAll();
file.close();
QString code=QString::fromUtf8(localReadAll);
//开始词法分析
Lexer lx(code);
QList<Token> tokens = lx.run();
return a.exec();
}
执行后的结果:
" 3: ID > uint "
" 3: ID > a "
" 3: OP > = "
" 3: NUM > 2147483649 "
" 3: OP > , "
" 3: ID > b "
" 3: OP > = "
" 3: NUM > 321 "
" 3: OP > ; "
" 4: ID > double "
" 4: ID > c "
" 4: OP > = "
" 4: NUM > 111.1 "
" 4: OP > ; "
" 5: ID > string "
" 5: ID > str "
" 5: OP > = "
" 5: STRING > ABC123\n "
" 5: OP > ; "
" 7: ID > int "
" 7: ID > main "
" 7: OP > ( "
" 7: ID > int "
" 7: ID > aa "
" 7: OP > , "
" 7: ID > int "
" 7: ID > bb "
" 7: OP > ) "
" 8: OP > { "
" 9: ID > int "
" 9: ID > x "
" 9: OP > = "
" 9: NUM > 0 "
" 9: OP > , "
" 9: ID > y "
" 9: OP > = "
" 9: NUM > 3 "
" 9: OP > ; "
" 10: ID > a "
" 10: OP > ++ "
" 10: OP > ; "
" 11: ID > a "
" 11: OP > -- "
" 11: OP > ; "
" 12: ID > if "
" 12: OP > ( "
" 12: ID > a "
" 12: OP > != "
" 12: ID > b "
" 12: OP > ) "
" 13: OP > { "
" 14: ID > a "
" 14: OP > = "
" 14: NUM > 1 "
" 14: OP > ; "
" 15: OP > } "
" 16: ID > else "
" 17: OP > { "
" 18: ID > a "
" 18: OP > = "
" 18: NUM > 2 "
" 18: OP > ; "
" 19: OP > } "
" 20: ID > printf "
" 20: OP > ( "
" 20: STRING > ABC %d '\" \\ 123\r\n "
" 20: OP > , "
" 20: ID > a "
" 20: OP > ) "
" 20: OP > ; "
" 21: OP > } "
" 25: ID > int "
" 25: ID > add "
" 25: OP > ( "
" 25: ID > int "
" 25: ID > a1 "
" 25: OP > , "
" 25: ID > int "
" 25: ID > a2 "
" 25: OP > ) "
" 26: OP > { "
" 27: ID > return "
" 27: ID > a1 "
" 27: OP > + "
" 27: ID > a2 "
" 27: OP > ; "
" 28: OP > } "
=================================================
可以看出,词法分析器把代码中的关键字、操作符、字符串、分隔符等都分离出来了。当然,这是一个超级简单的词法分析器,功能并不完善,只是为了让我们了解编译过程中编译器做了哪些工作。
|