词法分析器实现[c++]

#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <vector>
#include <map>
#include <string.h>

using namespace std;

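//Convention: every source line ends with '\n'
//Convention: identifiers are meant to be at most 64 characters long (note: division_str buffers up to 1025 characters per word)

//Comments: only the "//" form is supported for now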

//Reserved words (keywords) looked up by is_keys.
//KEYS_COUNT is the number of rows of static_key_words that is_keys scans.
const int KEYS_COUNT = 32;
static char static_key_words[32][20] = {
    "auto", "break", "case", "char", "const", "continue",
    "default", "do", "double", "else", "enum", "extern",
    "float", "for", "goto", "if", "int", "long",
    "register", "return", "short", "signed", "sizeof", "static",
    "struct", "switch", "typedef", "union", "unsigned", "void",
    "volatile", "while"
};

//Constants convention: constants appear on the right-hand side of assignment statements

//Operators consist of at most two characters.
//Characters that can start a multi-character operator.
//NOTE(review): this table is not referenced anywhere in the visible file.
static char both_operator_com[10][2] = {
    ">", "<", "=", "-", "+",
    "!", "&", "|", "/", "*"
};

//Comment detection.
//NOTE(review): this holds a backslash, whereas division_str detects comments by "//"
//(forward slashes); the variable is not referenced in the visible file - confirm intent.
static char annotation_char[2] = "\\";

//Opening characters of string and character constants (double quote and single quote);
//scanned by is_const_char.
const int CONST_CHARS_COUNT = 2;
static char const_chars[2][2] = {
    "\"", "'"
};

//Return codes shared by the helper functions.
//SUCC is 0, so a result must be compared explicitly (== SUCC / == FAIL);
//using it directly as a boolean inverts the meaning.
const int SUCC = 0;
const int FAIL = 1;
const int ERROR = -1;

//Returned by read_sour when the input file cannot be opened.
const int FILE_NOT_EXIT = 10;

//Single-character separators that end the current word (scanned by is_not_division_char).
//'/' is part of this table; division_str treats it in a branch of its own because it
//may start a "//" comment.
const int DIVISION_CHARS_COUNT = 21;
static char division_chars[21][2] = {
    " ", ">", "<", "=", "-",
    "+", "!", "&", "|", "%",
    "*", ";", "(", ",", "/",
    ")", "{", "}", "[", "]",
    "."
};

//Two-character separators accepted by is_multi_division_chars.
//NOTE(review): "*=" is listed twice, so only 12 distinct pairs are covered;
//possibly another operator was intended for one of the slots - confirm.
const int DIVISION_MULTI_CHARS_COUNT = 13;
static char division_multi_chars[13][3] = {
    ">>", "<<", "<=", ">=", "+=",
    "-=", "*=", "/=", "!=", "&&",
    "*=", "||", "=="
};

//Two-character check: characters that can be the SECOND character of a
//two-character separator. division_str tests the character following a
//separator against this list (via is_not_multi_division_char).
const int DIVISION_MULTI_CHAR_COUNT = 5;
static char division_multi_char[5][2] = {
    ">", "<", "=", "&", "|"
};

//Single-character operators (scanned by is_op).
const int SINGLE_OP_COUNT = 14;
static char single_operator[14][2] = {
    "+", "-", "*", "/", "<",
    ">", "=", "^", ",", "&",
    "|", "%", "~", "!"
};

//Two-character operators (scanned by is_multi_op).
//NOTE(review): "*=" is listed twice, so only 12 distinct operators are covered;
//possibly another operator was intended for one of the slots - confirm.
const int MULTI_OP_COUNT = 13;
static char multi_operator[13][3] = {
    ">>", "<<", "<=", ">=", "+=",
    "-=", "*=", "/=", "!=", "&&",
    "==", "*=", "||"
};

//Delimiter characters (scanned by is_limit).
const int LIMIT_COUNT = 8;
static char limit_[8][2] = {
    "(", ")", "{", "}", ".",
    "[", "]", ";"
};

//Current scan state, meant to be meaningful for assignment statements.
//NOTE(review): never read or written in the visible file.
int status = 0;

//Category labels attached to each word by word_analize.
static char TAG[4] = "tag";         //identifier
static char KEY[4] = "key";         //reserved word
static char CONST[6] = "const";     //constant
static char OP[3] = "op";           //operator
static char LIMIT[6] = "limit";     //delimiter
static char ERR[6] = "error";       //error

vector< pair<char*, char*> > tokens;    //(category label, word) pairs; main passes this to word_analize as the result list
vector<char*> anno;                     //comment texts collected by division_str

//Memory initialisation: fill the first `length` bytes of `chrs` with `c`.
//Nothing is written when `length` is zero or negative.
void memset_(char* chrs, char c, int length)
{
    char* cursor = chrs;
    int remaining = length;
    while(remaining > 0) {
        *cursor = c;
        ++cursor;
        --remaining;
    }
}

//Tell whether `c` is NOT a single-character separator.
//Returns SUCC when `c` is absent from division_chars, FAIL when it is listed.
int is_not_division_char(char c)
{
    int idx = 0;
    while(idx < DIVISION_CHARS_COUNT && division_chars[idx][0] != c) {
        idx++;
    }
    //the scan stops before the end only when an entry matched
    return (idx < DIVISION_CHARS_COUNT) ? FAIL : SUCC;
}

//Tell whether `c` can NOT be the second character of a two-character separator.
//Returns SUCC when `c` is absent from division_multi_char, FAIL when it is listed.
int is_not_multi_division_char(char c)
{
    int idx = 0;
    while(idx < DIVISION_MULTI_CHAR_COUNT && division_multi_char[idx][0] != c) {
        idx++;
    }
    //the scan stops before the end only when an entry matched
    return (idx < DIVISION_MULTI_CHAR_COUNT) ? FAIL : SUCC;
}

//Tell whether the NUL-terminated string `chrs` is a two-character separator.
//Returns SUCC when `chrs` has length 2 and equals an entry of division_multi_chars,
//FAIL otherwise.
int is_multi_division_chars(char* chrs)
{
    if(strlen(chrs) != 2) {
        return FAIL;
    }
    int idx = 0;
    while(idx < DIVISION_MULTI_CHARS_COUNT) {
        if(strcmp(chrs, division_multi_chars[idx]) == 0) {
            return SUCC;
        }
        ++idx;
    }
    return FAIL;
}

//Tell whether `c` is a decimal digit: SUCC for '0'..'9', FAIL otherwise.
int is_dig(char c)
{
    return (c >= '0' && c <= '9') ? SUCC : FAIL;
}

//Tell whether `c` opens a string or character constant.
//Meant to be applied to the first character of a word.
//Returns SUCC when `c` is listed in const_chars, FAIL otherwise.
int is_const_char(char c)
{
    int idx = 0;
    while(idx < CONST_CHARS_COUNT) {
        if(const_chars[idx][0] == c) {
            return SUCC;
        }
        ++idx;
    }
    return FAIL;
}


//Append a NUL-terminated heap copy of the first `count` characters of `src` to `dest`.
//The copy is allocated with new[]; whoever consumes `dest` owns it.
static void store_copy_(vector<char*> &dest, const char* src, int count)
{
    char* copy = new char[count + 1];
    memcpy(copy, src, count);
    copy[count] = '\0';
    dest.push_back(copy);
}

/** Split one source line into words.
 *
 * Characters that are not separators (see is_not_division_char) are gathered
 * into a word; '\t' and '\r' are dropped without ending the word. A separator
 * ends the current word, and then:
 *   - ' ' is discarded;
 *   - "//" starts a comment: the rest of the line, beginning at the second
 *     slash, is appended to the global `anno` and the line is finished;
 *   - a separator whose next character can complete a two-character
 *     separator is emitted as that pair when is_multi_division_chars
 *     accepts it;
 *   - every other separator is emitted as a one-character word.
 * Quotes get no special treatment, so a literal containing separators is
 * split like any other text.
 *
 * A word longer than 1025 characters is skipped up to the next separator and
 * reported on stderr instead of being stored.
 *
 * @param line  NUL-terminated line to split
 * @param words receives the words; each is a new[]-allocated copy
 * @param line_ line number, used only in the over-length diagnostic
*/
void division_str(char* line, vector<char*> &words, int line_)
{
    const int WORD_MAX = 1025;      //longest word that is kept
    int len = strlen(line);
    if(len < 1) {
        return;
    }
    char word[WORD_MAX];
    int i = 0;
    while(i < len) {
        //1. gather characters up to the next separator or the end of the line
        int j = 0;
        bool too_long = false;
        while(i < len && is_not_division_char(line[i]) == SUCC) {
            char c = line[i++];
            if(c == '\t' || c == '\r') {
                continue;
            }
            if(j < WORD_MAX) {
                word[j++] = c;
            } else {
                too_long = true;    //keep consuming, but never write past the buffer
            }
        }
        if(too_long) {
            fprintf(stderr, "line %d: word longer than %d characters skipped\n", line_, WORD_MAX);
        } else if(j > 0) {
            store_copy_(words, word, j);
        }
        if(i >= len) {
            break;
        }

        //2. line[i] is a separator; line[i+1] is at worst the terminating '\0'
        char sep = line[i];
        if(sep == ' ') {
            i++;
            continue;
        }
        if(sep == '/' && line[i+1] == '/') {
            //the stored comment text starts at the second slash
            store_copy_(anno, line + i + 1, len - i - 1);
            break;
        }
        if(is_not_multi_division_char(line[i+1]) == FAIL) {
            char two[3] = { sep, line[i+1], '\0' };
            if(is_multi_division_chars(two) == SUCC) {
                store_copy_(words, two, 2);
                i += 2;
                continue;
            }
            //not a known pair: fall through so the first character is still emitted
        }
        store_copy_(words, line + i, 1);
        i++;
    }
}

//Tell whether `chrs` is an identifier: a letter or '_' followed by any number
//of letters, digits or '_'.
//Returns SUCC for an identifier, FAIL otherwise (including the empty string).
int is_tag(char* chrs)
{
    int len = strlen(chrs);
    //reject empty input
    if(len < 1) {
        return FAIL;
    }
    //first character: letter or underscore only
    char first = chrs[0];
    bool first_ok = first == '_'
                    || (first >= 'A' && first <= 'Z')
                    || (first >= 'a' && first <= 'z');
    if(!first_ok) {
        return FAIL;
    }
    //remaining characters: each one is checked on its own (digits allowed here)
    for(int i = 1; i < len; i++) {
        char c = chrs[i];
        bool ok = c == '_'
                  || (c >= '0' && c <= '9')
                  || (c >= 'A' && c <= 'Z')
                  || (c >= 'a' && c <= 'z');
        if(!ok) {
            return FAIL;
        }
    }
    return SUCC;
}

//是否为常量
int _is_const(char* chrs)
{
    //从开始字符进行分流
    int len = strlen(chrs);
    if(len < 1) {
        //空字符情况排除
        return FAIL;
    }
    if(is_const_char(chrs[0]) == SUCC && chrs[len-1] == chrs[0]) {
        int i = 1;
        for(; i < len; i++) {
            if(chrs[i] == chrs[0] && chrs[i-1] != '\\') {
                break;
            }
        }
        if(
           i == 2   //考虑到空字符串的可能
           && i != len  //并且字符串终结符并不再结尾
        ) {
            return ERROR;
        }
        return SUCC;
    } else if(is_dig(chrs[0]) == SUCC) {
        int i = 1;
        for(; i < len; i++) {
            if(is_dig(chrs[i]) != SUCC) {
                break;
            }
        }
        if(i != len) {
            return ERROR;
        }
        return SUCC;
    }
    return FAIL;
}

//Tell whether `c` is a single-character operator.
//Returns SUCC when `c` is listed in single_operator, FAIL otherwise.
int is_op(char c)
{
    int idx = 0;
    while(idx < SINGLE_OP_COUNT) {
        if(single_operator[idx][0] == c) {
            return SUCC;
        }
        ++idx;
    }
    return FAIL;
}

//Tell whether the NUL-terminated string `chrs` is a two-character operator.
//Returns SUCC when `chrs` has length 2 and equals an entry of multi_operator,
//FAIL otherwise.
int is_multi_op(char* chrs)
{
    if(strlen(chrs) != 2) {
        return FAIL;
    }
    int idx = 0;
    while(idx < MULTI_OP_COUNT) {
        if(strcmp(chrs, multi_operator[idx]) == 0) {
            return SUCC;
        }
        ++idx;
    }
    return FAIL;
}

//Tell whether `chrs` is a reserved word (keyword).
//Returns SUCC when `chrs` equals an entry of static_key_words, FAIL otherwise
//(including the empty string).
int is_keys(char* chrs)
{
    if(chrs[0] == '\0') {
        return FAIL;
    }
    for(int i = 0; i < KEYS_COUNT; i++) {
        if(strcmp(chrs, static_key_words[i]) == 0) {
            return SUCC;
        }
    }
    //no keyword matched; every path of a non-void function has to return a value
    return FAIL;
}

//Tell whether `c` is a delimiter character.
//Returns SUCC when `c` is listed in limit_, FAIL otherwise.
int is_limit(char c)
{
    int idx = 0;
    while(idx < LIMIT_COUNT && limit_[idx][0] != c) {
        idx++;
    }
    //the scan stops before the end only when an entry matched
    return (idx < LIMIT_COUNT) ? SUCC : FAIL;
}

//Tell whether `chrs` looks like a "//" comment. Intended traits of a comment:
//  1. it starts a word or a line;
//  2. it runs to the end of the line.
//Only the first character is inspected: SUCC when it is '/', FAIL otherwise.
int is_anno(char* chrs)
{
    //an empty string has '\0' as its first character, so it is rejected by the same test
    return (chrs[0] == '/') ? SUCC : FAIL;
}

//Read the file `file_name` line by line.
//Every line (without its '\n') is appended to `output` as a new[]-allocated,
//NUL-terminated copy. A trailing newline at the end of the file does not
//produce an extra empty entry.
//Returns FILE_NOT_EXIT when the file cannot be opened, SUCC otherwise.
int read_sour(char* file_name, vector<char*> &output)
{
    ifstream fin;
    fin.open(file_name);
    if(!fin) {
        return FILE_NOT_EXIT;
    }
    string line;
    //testing the result of getline stops at end of file AND on read errors,
    //and never processes a line that was not actually read
    while(getline(fin, line, '\n')) {
        int slen = line.length();
        char * line_ = new char[slen+1];
        memcpy(line_, line.c_str(), slen);
        line_[slen] = '\0';
        output.push_back(line_);
    }
    fin.close();
    return SUCC;
}

//Split every line of `input` into words and append them to `output`.
//The index of the line is handed to division_str as its line number.
//Returns FAIL when `input` is empty, SUCC otherwise.
int collect_words(vector<char*> &input, vector<char*> &output)
{
    const int total = input.size();
    if(total < 1) {
        return FAIL;
    }
    int idx = 0;
    while(idx < total) {
        division_str(input[idx], output, idx);
        ++idx;
    }
    return SUCC;
}

//单词分类--词法分析
void word_analize(vector<char*> words, vector< pair<char*, char*> > &result)
{
    //int op_type = 0;    //0开始 1接收了一个字符 2接收了两个字符
    char multi_op[3];
    int size_ = words.size();
    //printf("size: %d\n", size_);
    int ret;
    for(int i = 0; i < size_; i++) {
        //printf("-%d\n", i);
        ret = is_limit(words[i][0]);
        if(ret == SUCC) {
            printf("<%s, %s>\n", LIMIT, words[i]);
            result.push_back(pair<char*, char*>(LIMIT, words[i]));
            continue;
        }
        ret = is_keys(words[i]);
        if(ret == SUCC) {
            printf("<%s, %s>\n", KEY, words[i]);
            result.push_back(pair<char*, char*>(KEY, words[i]));
            continue;
        }

        ret = is_op(words[i][0]);
        if(ret == SUCC) {   //第一层,单运算符
            printf("<%s, %s>\n", OP, words[i]);
            result.push_back(pair<char*, char*>(OP, words[i]));
            continue;
        }
        ret = is_multi_op(words[i]);
        if(ret == SUCC) {
            printf("<%s, %s>\n", OP, words[i]);
            result.push_back(pair<char*, char*>(OP, words[i]));
            continue;
        }
        ret = _is_const(words[i]);
        if(ret == SUCC) {
            printf("<%s, %s>\n", CONST, words[i]);
            result.push_back(pair<char*, char*>(CONST, words[i]));
            continue;
        } else if(ret == ERROR) {
            printf("<%s, %s>\n", ERR, words[i]);
            result.push_back(pair<char*, char*>(ERR, words[i]));
            continue;
        }
        ret = is_tag(words[i]);
        if(ret == SUCC) {
            printf("<%s, %s>\n", TAG, words[i]);
            result.push_back(pair<char*, char*>(TAG, words[i]));
            continue;
        }
        printf("<%s, %s>\n", ERR, words[i]);
        result.push_back(pair<char*, char*>(ERR, words[i]));
    }
}

//Debug helper: print every entry of `words` together with its index.
void scan_words(vector<char*> words)
{
    int index = 0;
    for(vector<char*>::iterator it = words.begin(); it != words.end(); ++it) {
        printf("line: %d, content: %s\n", index, *it);
        ++index;
    }
}

//Program entry: read "sour.code", split it into words and classify them.
//Returns -1 when the input file cannot be opened, 0 otherwise.
int main()
{
    static char FILE_NAME[32] = {"sour.code"};
    vector<char*> lines;
    vector<char*> words;

    if(read_sour(FILE_NAME, lines) == FILE_NOT_EXIT) {
        printf("file[%s] not exits\n", FILE_NAME);
        return -1;
    }

    collect_words(lines, words);
    word_analize(words, tokens);
    return 0;
}

少量分析写在代码注释中。写 C++ 太累了……

发布了31 篇原创文章 · 获赞 3 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/qq_36557960/article/details/90575093