GCC编译流程:从源代码到可执行程序——浅析编译原理

未完待续》》》
敬请等候~~~

在学校的时候,你是否听说过一门名为《编译原理》的课程?是否在课堂上被老师问过“知道你写的代码是怎么变成可执行文件运行的吗?”,又是否被“编译”、“预处理”、“宏定义展开”、“抽象语法树”、“汇编”、“反汇编”等一大堆令人头大的概念劝退,失去了学习它的动力?
就让我们重温一遍程序的编译过程。我们常说的编译,一般是指将一个或多个源代码文件通过编译器生成可执行文件的过程。程序的编译过程主要可分为以下三个阶段:

  • (1) 编译阶段
  • (2) 链接阶段
  • (3) 重定位阶段
    编译流程

高清PNG图下载地址:https://download.csdn.net/download/qq_36393978/85329800
超高清PDF文件下载地址:https://download.csdn.net/download/qq_36393978/85329905

1. 编译阶段

常用的编译工具链 gcc 在编译阶段需要调用工具链中的“预处理器”、“编译器”、“汇编器”,其处理流程如下。
[注]:图中的 GIMPLE 代码也可称为中间代码;RTL 即寄存器传输语言(Register Transfer Language)。
在这里插入图片描述

1.1. 程序的预处理

即处理源文件中的各个预处理命令(头文件的包含、自定义宏和内置宏的展开、条件编译、程序编译的控制语句)、删除注释以及添加行号和文件名标识。具体包含哪些内置宏以及需要处理哪些预处理命令,我们可以从源码中看到;
gcc-12.1.0提供的内置宏如下:

//# gcc-12.1.0/libcpp/init.cc	##------## struct builtin_macro builtin_array[]
/* One entry of the table of macros that the preprocessor predefines.  */
struct builtin_macro
{
  const uchar *const name;              /* Spelling of the macro.  */
  const unsigned short len;             /* NOTE(review): presumably the length of
                                           NAME -- confirm against DSC.  */
  const unsigned short value;           /* Receives the BT_* code given as the
                                           second argument of B().  */
  const bool always_warn_if_redefined;  /* Receives the third argument of B().  */
};

#define B(n, t, f)    {
      
       DSC(n), t, f }
static const struct builtin_macro builtin_array[] =
{
    
    
  B("__TIMESTAMP__",	 BT_TIMESTAMP,     false),
  B("__TIME__",		 BT_TIME,          false),
  B("__DATE__",		 BT_DATE,          false),
  B("__FILE__",		 BT_FILE,          false),
  B("__FILE_NAME__",	 BT_FILE_NAME,     false),
  B("__BASE_FILE__",	 BT_BASE_FILE,     false),
  B("__LINE__",		 BT_SPECLINE,      true),
  B("__INCLUDE_LEVEL__", BT_INCLUDE_LEVEL, true),
  B("__COUNTER__",	 BT_COUNTER,       true),
  /* Make sure to update the list of built-in
     function-like macros in traditional.cc:
     fun_like_macro() when adding more following */
  B("__has_attribute",	 BT_HAS_ATTRIBUTE, true),
  B("__has_c_attribute", BT_HAS_STD_ATTRIBUTE, true),
  B("__has_cpp_attribute", BT_HAS_ATTRIBUTE, true),
  B("__has_builtin",	 BT_HAS_BUILTIN,   true),
  B("__has_include",	 BT_HAS_INCLUDE,   true),
  B("__has_include_next",BT_HAS_INCLUDE_NEXT,   true),
  /* Keep builtins not used for -traditional-cpp at the end, and
     update init_builtins() if any more are added.  */
  B("_Pragma",		 BT_PRAGMA,        true),
  B("__STDC__",		 BT_STDC,          true),
};
#undef B

gcc-12.1.0 中需要处理的预处理命令如下,其对应的处理函数则被放进 dtable 这个结构体数组里:

//# gcc-12.1.0/libcpp/directives.cc

/* Description of one preprocessing directive; `directive' names both the
   struct tag and the typedef.  */
typedef struct directive
{
  directive_handler handler;  /* Function that handles the directive.  */
  const uchar *name;          /* Directive name.  */
  unsigned short length;      /* Length of NAME.  */
  unsigned char origin;       /* Where the directive comes from.  */
  unsigned char flags;        /* Flags describing the directive.  */
} directive;

/* This is the table of directive handlers.  All extensions other than
   #warning, #include_next, and #import are deprecated.  The name is
   where the extension appears to have come from.  */

/* List of all directives, one D(...) invocation per row.  D is not defined
   here: whoever expands DIRECTIVE_TABLE defines D first and decides what a
   row turns into.  The four arguments of a row are the directive name, its
   T_* identifier, its origin (KANDR, STDC89, STDC2X or EXTENSION) and its
   flags.  Every row except the last ends in a backslash so that the whole
   list is a single macro definition.
   NOTE(review): the `= 0' on T_DEFINE suggests the list is also expanded
   into an enum somewhere outside this excerpt -- confirm.  */
#define DIRECTIVE_TABLE							\
  D(define,	T_DEFINE = 0,	KANDR,     IN_I)			\
  D(include,	T_INCLUDE,	KANDR,     INCL | EXPAND)		\
  D(endif,	T_ENDIF,	KANDR,     COND)			\
  D(ifdef,	T_IFDEF,	KANDR,     COND | IF_COND)		\
  D(if,		T_IF,		KANDR, 	   COND | IF_COND | EXPAND) 	\
  D(else,	T_ELSE,		KANDR,     COND)	   		\
  D(ifndef,	T_IFNDEF,	KANDR,     COND | IF_COND)		\
  D(undef,	T_UNDEF,	KANDR,     IN_I)			\
  D(line,	T_LINE,		KANDR,     EXPAND)			\
  D(elif,	T_ELIF,		STDC89,    COND | EXPAND)		\
  D(elifdef,	T_ELIFDEF,	STDC2X,    COND | ELIFDEF)		\
  D(elifndef,	T_ELIFNDEF,	STDC2X,    COND | ELIFDEF)		\
  D(error,	T_ERROR,	STDC89,    0)				\
  D(pragma,	T_PRAGMA,	STDC89,    IN_I)			\
  D(warning,	T_WARNING,	EXTENSION, 0)				\
  D(include_next, T_INCLUDE_NEXT, EXTENSION, INCL | EXPAND)		\
  D(ident,	T_IDENT,	EXTENSION, IN_I)			\
  D(import,	T_IMPORT,	EXTENSION, INCL | EXPAND)  /* ObjC */	\
  D(assert,	T_ASSERT,	EXTENSION, DEPRECATED)	   /* SVR4 */	\
  D(unassert,	T_UNASSERT,	EXTENSION, DEPRECATED)	   /* SVR4 */	\
  D(sccs,	T_SCCS,		EXTENSION, IN_I)   	   /*  SVR4? */

#define D(name, t, origin, flags) \
{
      
       do_##name, (const uchar *) #name, \
  sizeof #name - 1, origin, flags },
static const directive dtable[] =
{
    
    
DIRECTIVE_TABLE
};
#undef D

1.1.1. 小知识(宏的多态)

这里才华横溢的小伙伴们可能会对这里的宏定义(如 B、D)感到疑惑:为什么用 #define 定义了之后,又接着用 #undef 取消定义?
这里是为了实现宏的多态,即在不同地方、不同时候引用时,可以拥有不同的性质,产生不同的结果。若还是不能理解的小伙伴可以看下面这个简单的例子就一目了然了。

#include <stdio.h>

/* List macro: expands to NUM(1) NUM(2) NUM(3) NUM(4).  NUM itself is not
   defined here; each user defines NUM before expanding NUM_COUNT and
   removes it again afterwards.  */
#define NUM_COUNT    NUM(1) NUM(2) NUM(3) NUM(4)


/* Write NUM in decimal, followed by a newline, to standard output.  */
void print_num(int num)
{
    printf("%d\n", num);
}

/* Write NUM + 1 in decimal, followed by a newline, to standard output.  */
void num_increase(int num)
{
    int next = num + 1;

    printf("%d\n", next);
}


/* Expand the same NUM_COUNT list twice, with a different NUM each time.
   The command-line arguments are not used.  */
int main(int argc, char **argv)
{
    /* First pass: every list entry becomes a print_num() call.  */
#define NUM(value)    print_num(value);
    NUM_COUNT
#undef NUM

    printf("==============\n");

    /* Second pass: the very same list now becomes num_increase() calls.  */
#define NUM(value)    num_increase(value);
    NUM_COUNT
#undef NUM

    return 0;
}

其执行结果如下:

imaginemiracle@ubuntu:define$ ./a.out 
1
2
3
4
==============
2
3
4
5

1.2. 词法分析

1.2.1. 词法单元 (token)

词法分析器读取源代码,识别出其中的每个词法符号,并构造一个词法单元 token 来保存该符号,该过程主要由 _cpp_lex_token() 函数完成(该函数位于 gcc-12.1.0/libcpp/lex.cc 中),之后会对每个 token 进行检查并输出词法单元 token 序列。在词法分析器的源码中 token 实际由一个结构体表示。目前最新的 gcc-12.1.0 中对 c 语言的词法单元 token 的定义为 struct GTY(()) c_token,该结构体用于保存一个基本的词法单元。该结构体的定义中有个 GTY(()),是给该结构体加上 GTY 标识,其本身是一个宏定义,主要在后期的垃圾回收机制中使用到,这里不做讨论;

//# gcc-12.1.0/gcc/c/c-parser.h	##------## struct c_token
/* A single C token after string literal concatenation and conversion
   of preprocessing tokens to tokens.  */
struct GTY (()) c_token {
  /* Kind of token.  */
  ENUM_BITFIELD (cpp_ttype) type : 8;
  /* For a CPP_NAME token: whether the name is also declared as some kind
     of type.  C_ID_NONE for every other token.  */
  ENUM_BITFIELD (c_id_kind) id_kind : 8;
  /* Which keyword this is, or RID_MAX when the token is not a keyword.  */
  ENUM_BITFIELD (rid) keyword : 8;
  /* For a CPP_PRAGMA token: the pragma that was seen.  PRAGMA_NONE for
     every other token.  */
  ENUM_BITFIELD (pragma_kind) pragma_kind : 8;
  /* Where the token was found.  */
  location_t location;
  /* Value associated with the token, if it has one.  */
  tree value;
  /* Token flags.  */
  unsigned char flags;

  /* Source range obtained from LOCATION through line_table.  */
  source_range get_range () const
  { return get_range_from_loc (line_table, location); }

  /* The m_finish member of the range returned by get_range ().  */
  location_t get_finish () const
  { return get_range ().m_finish; }
};

而预处理阶段(libcpp)所使用的预处理词法单元 token 的定义为 struct GTY(()) cpp_token(这里的 cpp 指 C 预处理器,即 C PreProcessor,而不是 c++)。本文主要以 c 语言为例来分析整个编译流程。

//# gcc-12.1.0/libcpp/include/cpplib.h	##------## struct cpp_token

/* A preprocessing token.  This has been carefully packed and should
   occupy 16 bytes on 32-bit hosts and 24 bytes on 64-bit hosts.  */
struct GTY(()) cpp_token {
  /* Location of the token's first character, together with the range of
     the full token.  */
  location_t src_loc;

  /* Token type.  */
  ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT;
  /* Flags; their meaning is documented outside this excerpt.  */
  unsigned short flags;

  /* Payload of the token.  Each member carries a GTY tag, and the desc
     annotation on VAL names cpp_token_val_index (&%1) as the selector.  */
  union cpp_token_u
  {
    /* An identifier.  */
    struct cpp_identifier GTY ((tag ("CPP_TOKEN_FLD_NODE"))) node;

    /* Token from which padding is inherited.  */
    cpp_token * GTY ((tag ("CPP_TOKEN_FLD_SOURCE"))) source;

    /* A string or a number.  */
    struct cpp_string GTY ((tag ("CPP_TOKEN_FLD_STR"))) str;

    /* Argument number (and original spelling) of a CPP_MACRO_ARG.  */
    struct cpp_macro_arg GTY ((tag ("CPP_TOKEN_FLD_ARG_NO"))) macro_arg;

    /* Original token number of a CPP_PASTE, taken from a sequence of
       consecutive paste tokens in a macro expansion.  */
    unsigned int GTY ((tag ("CPP_TOKEN_FLD_TOKEN_NO"))) token_no;

    /* Identifier supplied by the caller for a CPP_PRAGMA.  */
    unsigned int GTY ((tag ("CPP_TOKEN_FLD_PRAGMA"))) pragma;
  } GTY ((desc ("cpp_token_val_index (&%1)"))) val;
};

struct c_token 结构体主要包括符号类型( type )、标识符类型( id_kind )、关键字( keyword )、PRAGMA类型( pragma_kind )、符号的值( value ) 以及用来描述该符号在源文件中位置的 ( location )。

1.2.2. 符号类型 (c_token::type)

gcc 中定义了一系列的操作符号,如常规运算符 =、+、-、>、< 等,分隔符 :、,、;、{、} 等,这类操作符号的定义以及符号 value 均由 OP 宏指定;还有一些其它类型的符号是由 TK 宏指定,主要包含 EOF、NAME、CHAR、STRING、PRAGMA 等。这些符号均被定义在 ./libcpp/include/cpplib.h 文件中的 enum cpp_ttype 枚举类型中。

1.2.3. 标识符类型 (c_token::id_kind)

从 struct c_token 结构体定义处的注释,我们可以了解到,当其中 type 的值不是 CPP_NAME 时,id_kind 的值为 C_ID_NONE,标识该类型不是一个标识符;相反,当 type 的值为 CPP_NAME 时,id_kind 将会有效(即会被赋有意义的值),此时则代表该类型是个标识符类型。CPP_NAME 的类型包含普通标识符(C_ID_ID)、类型名称(C_ID_TYPENAME)、Objective-C 中对象名称(C_ID_CLASSNAME)、地址空间类型(C_ID_ADDRSPACE)。

//# gcc-12.1.0/gcc/c/c-parser.h	##------## enum c_id_kind
/* More information about the type of a CPP_NAME token.  */
enum c_id_kind {
  C_ID_ID,         /* Ordinary identifier.  */
  C_ID_TYPENAME,   /* Identifier declared as a typedef name, i.e. it names
                      a type.  */
  C_ID_CLASSNAME,  /* Identifier declared as an Objective-C class name.  */
  C_ID_ADDRSPACE,  /* Address space identifier.  */
  C_ID_NONE        /* Not an identifier at all.  */
};

1.2.4. 关键字标识 (c_token::keyword)

当编码中需要使用一个标识符表示代码中的关键字(Keywords)时,就必须确定好这些关键字的具体值,由此来确定该关键字所代表的具体含义。而 c 语言所使用的所有关键字以及所对应的值都被定义在了 ./gcc/c-family/c-common.h 文件的 enum rid 枚举类型中。

//# gcc-12.1.0/gcc/c-family/c-common.h	##------## enum rid

/* Reserved identifiers.  This is the union of all the keywords for C,
   C++, and Objective-C.  All the type modifiers have to be in one
   block at the beginning, because they are used as mask bits.  There
   are 28 type modifiers; if we add many more we will have to redesign
   the mask mechanism.  */

/* Only RID_STATIC carries an explicit value (0); every following plain
   enumerator takes the next integer, so a keyword's value is its position
   in this list and the order must not be disturbed.  Enumerators written
   as `X = Y' are aliases for an existing value; the ones after RID_MAX
   name the first and last member of a group.  */
enum rid
{
    
    
  /* Modifiers: */
  /* C, in empirical order of frequency.  */
  RID_STATIC = 0,
  RID_UNSIGNED, RID_LONG,    RID_CONST, RID_EXTERN,
  RID_REGISTER, RID_TYPEDEF, RID_SHORT, RID_INLINE,
  RID_VOLATILE, RID_SIGNED,  RID_AUTO,  RID_RESTRICT,
  RID_NORETURN, RID_ATOMIC,

  /* C extensions */
  RID_COMPLEX, RID_THREAD, RID_SAT,

  /* C++ */
  RID_FRIEND, RID_VIRTUAL, RID_EXPLICIT, RID_EXPORT, RID_MUTABLE,

  /* ObjC ("PQ" reserved words - they do not appear after a '@' and
     are keywords only in specific contexts)  */
  RID_IN, RID_OUT, RID_INOUT, RID_BYCOPY, RID_BYREF, RID_ONEWAY,

  /* ObjC ("PATTR" reserved words - they do not appear after a '@' 
     and are keywords only as property attributes)  */
  RID_GETTER, RID_SETTER,
  RID_READONLY, RID_READWRITE,
  RID_ASSIGN, RID_RETAIN, RID_COPY,
  RID_PROPATOMIC, RID_NONATOMIC,

  /* ObjC nullability support keywords that also can appear in the
     property attribute context.  These values should remain contiguous
     with the other property attributes.  */
  RID_NULL_UNSPECIFIED, RID_NULLABLE, RID_NONNULL, RID_NULL_RESETTABLE,

  /* C (reserved and imaginary types not implemented, so any use is a
     syntax error) */
  RID_IMAGINARY,

  /* C */
  RID_INT,     RID_CHAR,   RID_FLOAT,    RID_DOUBLE, RID_VOID,
  RID_ENUM,    RID_STRUCT, RID_UNION,    RID_IF,     RID_ELSE,
  RID_WHILE,   RID_DO,     RID_FOR,      RID_SWITCH, RID_CASE,
  RID_DEFAULT, RID_BREAK,  RID_CONTINUE, RID_RETURN, RID_GOTO,
  RID_SIZEOF,

  /* C extensions */
  RID_ASM,       RID_TYPEOF,   RID_ALIGNOF,  RID_ATTRIBUTE,  RID_VA_ARG,
  RID_EXTENSION, RID_IMAGPART, RID_REALPART, RID_LABEL,      RID_CHOOSE_EXPR,
  RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,	     RID_BUILTIN_SHUFFLE,
  RID_BUILTIN_SHUFFLEVECTOR,   RID_BUILTIN_CONVERTVECTOR,   RID_BUILTIN_TGMATH,
  RID_BUILTIN_HAS_ATTRIBUTE,   RID_BUILTIN_ASSOC_BARRIER,
  RID_DFLOAT32, RID_DFLOAT64, RID_DFLOAT128,

  /* TS 18661-3 keywords, in the same sequence as the TI_* values.  */
  RID_FLOAT16,
  RID_FLOATN_NX_FIRST = RID_FLOAT16,
  RID_FLOAT32,
  RID_FLOAT64,
  RID_FLOAT128,
  RID_FLOAT32X,
  RID_FLOAT64X,
  RID_FLOAT128X,
  /* Helper macro defined in the middle of the enumerator list; it adds no
     enumerator.  It expands to the case labels for the seven RID_FLOAT*
     values above, without the colon after the last label.  */
#define CASE_RID_FLOATN_NX						\
  case RID_FLOAT16: case RID_FLOAT32: case RID_FLOAT64: case RID_FLOAT128: \
  case RID_FLOAT32X: case RID_FLOAT64X: case RID_FLOAT128X

  RID_FRACT, RID_ACCUM, RID_AUTO_TYPE, RID_BUILTIN_CALL_WITH_STATIC_CHAIN,

  /* "__GIMPLE", for the GIMPLE-parsing extension to the C frontend. */
  RID_GIMPLE,

  /* "__PHI", for parsing PHI function in GIMPLE FE.  */
  RID_PHI,

  /* "__RTL", for the RTL-parsing extension to the C frontend.  */
  RID_RTL,

  /* C11 */
  RID_ALIGNAS, RID_GENERIC,

  /* This means to warn that this is a C++ keyword, and then treat it
     as a normal identifier.  */
  RID_CXX_COMPAT_WARN,

  /* GNU transactional memory extension */
  RID_TRANSACTION_ATOMIC, RID_TRANSACTION_RELAXED, RID_TRANSACTION_CANCEL,

  /* Too many ways of getting the name of a function as a string */
  RID_FUNCTION_NAME, RID_PRETTY_FUNCTION_NAME, RID_C99_FUNCTION_NAME,

  /* C++ (some of these are keywords in Objective-C as well, but only
     if they appear after a '@') */
  RID_BOOL,     RID_WCHAR,    RID_CLASS,
  RID_PUBLIC,   RID_PRIVATE,  RID_PROTECTED,
  RID_TEMPLATE, RID_NULL,     RID_CATCH,
  RID_DELETE,   RID_FALSE,    RID_NAMESPACE,
  RID_NEW,      RID_OFFSETOF, RID_OPERATOR,
  RID_THIS,     RID_THROW,    RID_TRUE,
  RID_TRY,      RID_TYPENAME, RID_TYPEID,
  RID_USING,    RID_CHAR16,   RID_CHAR32,

  /* casts */
  RID_CONSTCAST, RID_DYNCAST, RID_REINTCAST, RID_STATCAST,

  /* C++ extensions */
  RID_ADDRESSOF,               RID_BASES,
  RID_BUILTIN_LAUNDER,         RID_DIRECT_BASES,
  RID_HAS_NOTHROW_ASSIGN,      RID_HAS_NOTHROW_CONSTRUCTOR,
  RID_HAS_NOTHROW_COPY,        RID_HAS_TRIVIAL_ASSIGN,
  RID_HAS_TRIVIAL_CONSTRUCTOR, RID_HAS_TRIVIAL_COPY,
  RID_HAS_TRIVIAL_DESTRUCTOR,  RID_HAS_UNIQUE_OBJ_REPRESENTATIONS,
  RID_HAS_VIRTUAL_DESTRUCTOR,  RID_BUILTIN_BIT_CAST,
  RID_IS_ABSTRACT,             RID_IS_AGGREGATE,
  RID_IS_BASE_OF,              RID_IS_CLASS,
  RID_IS_EMPTY,                RID_IS_ENUM,
  RID_IS_FINAL,                RID_IS_LAYOUT_COMPATIBLE,
  RID_IS_LITERAL_TYPE,
  RID_IS_POINTER_INTERCONVERTIBLE_BASE_OF,
  RID_IS_POD,                  RID_IS_POLYMORPHIC,
  RID_IS_SAME_AS,
  RID_IS_STD_LAYOUT,           RID_IS_TRIVIAL,
  RID_IS_TRIVIALLY_ASSIGNABLE, RID_IS_TRIVIALLY_CONSTRUCTIBLE,
  RID_IS_TRIVIALLY_COPYABLE,
  RID_IS_UNION,                RID_UNDERLYING_TYPE,
  RID_IS_ASSIGNABLE,           RID_IS_CONSTRUCTIBLE,
  RID_IS_NOTHROW_ASSIGNABLE,   RID_IS_NOTHROW_CONSTRUCTIBLE,

  /* C++11 */
  RID_CONSTEXPR, RID_DECLTYPE, RID_NOEXCEPT, RID_NULLPTR, RID_STATIC_ASSERT,

  /* C++20 */
  RID_CONSTINIT, RID_CONSTEVAL,

  /* char8_t */
  RID_CHAR8,

  /* C++ concepts */
  RID_CONCEPT, RID_REQUIRES,

  /* C++ modules.  */
  RID__MODULE, RID__IMPORT, RID__EXPORT, /* Internal tokens.  */

  /* C++ coroutines */
  RID_CO_AWAIT, RID_CO_YIELD, RID_CO_RETURN,

  /* C++ transactional memory.  */
  RID_ATOMIC_NOEXCEPT, RID_ATOMIC_CANCEL, RID_SYNCHRONIZED,

  /* Objective-C ("AT" reserved words - they are only keywords when
     they follow '@')  */
  RID_AT_ENCODE,   RID_AT_END,
  RID_AT_CLASS,    RID_AT_ALIAS,     RID_AT_DEFS,
  RID_AT_PRIVATE,  RID_AT_PROTECTED, RID_AT_PUBLIC,  RID_AT_PACKAGE,
  RID_AT_PROTOCOL, RID_AT_SELECTOR,
  RID_AT_THROW,	   RID_AT_TRY,       RID_AT_CATCH,
  RID_AT_FINALLY,  RID_AT_SYNCHRONIZED, 
  RID_AT_OPTIONAL, RID_AT_REQUIRED, RID_AT_PROPERTY,
  RID_AT_SYNTHESIZE, RID_AT_DYNAMIC,
  RID_AT_INTERFACE,
  RID_AT_IMPLEMENTATION,

  /* Named address support, mapping the keyword to a particular named address
     number.  Named address space 0 is reserved for the generic address.  If
     there are more than 254 named addresses, the addr_space_t type will need
     to be grown from an unsigned char to unsigned short.  */
  RID_ADDR_SPACE_0,		/* generic address */
  RID_ADDR_SPACE_1,
  RID_ADDR_SPACE_2,
  RID_ADDR_SPACE_3,
  RID_ADDR_SPACE_4,
  RID_ADDR_SPACE_5,
  RID_ADDR_SPACE_6,
  RID_ADDR_SPACE_7,
  RID_ADDR_SPACE_8,
  RID_ADDR_SPACE_9,
  RID_ADDR_SPACE_10,
  RID_ADDR_SPACE_11,
  RID_ADDR_SPACE_12,
  RID_ADDR_SPACE_13,
  RID_ADDR_SPACE_14,
  RID_ADDR_SPACE_15,

  RID_FIRST_ADDR_SPACE = RID_ADDR_SPACE_0,
  RID_LAST_ADDR_SPACE = RID_ADDR_SPACE_15,

  /* __intN keywords.  The _N_M here doesn't correspond to the intN
     in the keyword; use the bitsize in int_n_t_data_t[M] for that.
     For example, if int_n_t_data_t[0].bitsize is 13, then RID_INT_N_0
     is for __int13.  */

  /* Note that the range to use is RID_FIRST_INT_N through
     RID_FIRST_INT_N + NUM_INT_N_ENTS - 1 and c-parser.cc has a list of
     all RID_INT_N_* in a case statement.  */

  RID_INT_N_0,
  RID_INT_N_1,
  RID_INT_N_2,
  RID_INT_N_3,

  RID_FIRST_INT_N = RID_INT_N_0,
  RID_LAST_INT_N = RID_INT_N_3,

  /* One past the last keyword value.  */
  RID_MAX,

  /* Everything below is an alias marking the first or last member of a
     group; none of these lines introduces a new value.  */
  RID_FIRST_MODIFIER = RID_STATIC,
  RID_LAST_MODIFIER = RID_ONEWAY,

  RID_FIRST_CXX11 = RID_CONSTEXPR,
  RID_LAST_CXX11 = RID_STATIC_ASSERT,
  RID_FIRST_CXX20 = RID_CONSTINIT,
  /* NOTE(review): RID_CONSTEVAL is listed right after RID_CONSTINIT in the
     C++20 group above but lies outside this range -- confirm that this is
     intended.  */
  RID_LAST_CXX20 = RID_CONSTINIT,
  RID_FIRST_AT = RID_AT_ENCODE,
  RID_LAST_AT = RID_AT_IMPLEMENTATION,
  RID_FIRST_PQ = RID_IN,
  RID_LAST_PQ = RID_ONEWAY,
  RID_FIRST_PATTR = RID_GETTER,
  RID_LAST_PATTR = RID_NULL_RESETTABLE
};

从上面代码可以看到 GCC 对所支持的所有关键字编写了一个枚举用来表示与之相对应关键字的具体值,而对于关键字的声明则放在了 ./gcc/c-family/c-common.cc 文件中的 c_common_reswords 结构体数组中。

//# gcc-12.1.0/gcc/c-family/c-common.cc	##------## const struct c_common_resword c_common_reswords[]

/* Reserved words.  The third field is a mask: keywords are disabled
   if they match the mask.

   Masks for languages:
   C --std=c89: D_C99 | D_CXXONLY | D_OBJC | D_CXX_OBJC
   C --std=c99: D_CXXONLY | D_OBJC
   ObjC is like C except that D_OBJC and D_CXX_OBJC are not set
   C++ --std=c++98: D_CONLY | D_CXX11 | D_CXX20 | D_OBJC
   C++ --std=c++11: D_CONLY | D_CXX20 | D_OBJC
   C++ --std=c++20: D_CONLY | D_OBJC
   ObjC++ is like C++ except that D_OBJC is not set

   If -fno-asm is used, D_ASM is added to the mask.  If
   -fno-gnu-keywords is used, D_EXT is added.  If -fno-asm and C in
   C89 mode, D_EXT89 is added for both -fno-asm and -fno-gnu-keywords.
   In C with -Wc++-compat, we warn if D_CXXWARN is set.

   Note the complication of the D_CXX_OBJC keywords.  These are
   reserved words such as 'class'.  In C++, 'class' is a reserved
   word.  In Objective-C++ it is too.  In Objective-C, it is a
   reserved word too, but only if it follows an '@' sign.
*/
/* Each initializer is { spelling, RID_* code, disable mask }, one keyword
   per line.  The order of the entries is kept unchanged.  */
const struct c_common_resword c_common_reswords[] =
{
  { "_Alignas",                 RID_ALIGNAS,       D_CONLY },
  { "_Alignof",                 RID_ALIGNOF,       D_CONLY },
  { "_Atomic",                  RID_ATOMIC,        D_CONLY },
  { "_Bool",                    RID_BOOL,          D_CONLY },
  { "_Complex",                 RID_COMPLEX,       0 },
  { "_Imaginary",               RID_IMAGINARY,     D_CONLY },
  { "_Float16",                 RID_FLOAT16,       D_CONLY },
  { "_Float32",                 RID_FLOAT32,       D_CONLY },
  { "_Float64",                 RID_FLOAT64,       D_CONLY },
  { "_Float128",                RID_FLOAT128,      D_CONLY },
  { "_Float32x",                RID_FLOAT32X,      D_CONLY },
  { "_Float64x",                RID_FLOAT64X,      D_CONLY },
  { "_Float128x",               RID_FLOAT128X,     D_CONLY },
  { "_Decimal32",               RID_DFLOAT32,      D_CONLY },
  { "_Decimal64",               RID_DFLOAT64,      D_CONLY },
  { "_Decimal128",              RID_DFLOAT128,     D_CONLY },
  { "_Fract",                   RID_FRACT,         D_CONLY | D_EXT },
  { "_Accum",                   RID_ACCUM,         D_CONLY | D_EXT },
  { "_Sat",                     RID_SAT,           D_CONLY | D_EXT },
  { "_Static_assert",           RID_STATIC_ASSERT, D_CONLY },
  { "_Noreturn",                RID_NORETURN,      D_CONLY },
  { "_Generic",                 RID_GENERIC,       D_CONLY },
  { "_Thread_local",            RID_THREAD,        D_CONLY },
  { "__FUNCTION__",             RID_FUNCTION_NAME, 0 },
  { "__PRETTY_FUNCTION__",      RID_PRETTY_FUNCTION_NAME, 0 },
  { "__alignof",                RID_ALIGNOF,       0 },
  { "__alignof__",              RID_ALIGNOF,       0 },
  { "__asm",                    RID_ASM,           0 },
  { "__asm__",                  RID_ASM,           0 },
  { "__attribute",              RID_ATTRIBUTE,     0 },
  { "__attribute__",            RID_ATTRIBUTE,     0 },
  { "__auto_type",              RID_AUTO_TYPE,     D_CONLY },
  { "__bases",                  RID_BASES,         D_CXXONLY },
  { "__builtin_addressof",      RID_ADDRESSOF,     D_CXXONLY },
  { "__builtin_bit_cast",       RID_BUILTIN_BIT_CAST, D_CXXONLY },
  { "__builtin_call_with_static_chain",
    RID_BUILTIN_CALL_WITH_STATIC_CHAIN, D_CONLY },
  { "__builtin_choose_expr",    RID_CHOOSE_EXPR,   D_CONLY },
  { "__builtin_complex",        RID_BUILTIN_COMPLEX, D_CONLY },
  { "__builtin_convertvector",  RID_BUILTIN_CONVERTVECTOR, 0 },
  { "__builtin_has_attribute",  RID_BUILTIN_HAS_ATTRIBUTE, 0 },
  { "__builtin_launder",        RID_BUILTIN_LAUNDER, D_CXXONLY },
  { "__builtin_assoc_barrier",  RID_BUILTIN_ASSOC_BARRIER, 0 },
  { "__builtin_shuffle",        RID_BUILTIN_SHUFFLE, 0 },
  { "__builtin_shufflevector",  RID_BUILTIN_SHUFFLEVECTOR, 0 },
  { "__builtin_tgmath",         RID_BUILTIN_TGMATH, D_CONLY },
  { "__builtin_offsetof",       RID_OFFSETOF,      0 },
  { "__builtin_types_compatible_p", RID_TYPES_COMPATIBLE_P, D_CONLY },
  { "__builtin_va_arg",         RID_VA_ARG,        0 },
  { "__complex",                RID_COMPLEX,       0 },
  { "__complex__",              RID_COMPLEX,       0 },
  { "__const",                  RID_CONST,         0 },
  { "__const__",                RID_CONST,         0 },
  { "__constinit",              RID_CONSTINIT,     D_CXXONLY },
  { "__decltype",               RID_DECLTYPE,      D_CXXONLY },
  { "__direct_bases",           RID_DIRECT_BASES,  D_CXXONLY },
  { "__extension__",            RID_EXTENSION,     0 },
  { "__func__",                 RID_C99_FUNCTION_NAME, 0 },
  { "__has_nothrow_assign",     RID_HAS_NOTHROW_ASSIGN, D_CXXONLY },
  { "__has_nothrow_constructor", RID_HAS_NOTHROW_CONSTRUCTOR, D_CXXONLY },
  { "__has_nothrow_copy",       RID_HAS_NOTHROW_COPY, D_CXXONLY },
  { "__has_trivial_assign",     RID_HAS_TRIVIAL_ASSIGN, D_CXXONLY },
  { "__has_trivial_constructor", RID_HAS_TRIVIAL_CONSTRUCTOR, D_CXXONLY },
  { "__has_trivial_copy",       RID_HAS_TRIVIAL_COPY, D_CXXONLY },
  { "__has_trivial_destructor", RID_HAS_TRIVIAL_DESTRUCTOR, D_CXXONLY },
  { "__has_unique_object_representations", RID_HAS_UNIQUE_OBJ_REPRESENTATIONS,
    D_CXXONLY },
  { "__has_virtual_destructor", RID_HAS_VIRTUAL_DESTRUCTOR, D_CXXONLY },
  { "__imag",                   RID_IMAGPART,      0 },
  { "__imag__",                 RID_IMAGPART,      0 },
  { "__inline",                 RID_INLINE,        0 },
  { "__inline__",               RID_INLINE,        0 },
  { "__is_abstract",            RID_IS_ABSTRACT,   D_CXXONLY },
  { "__is_aggregate",           RID_IS_AGGREGATE,  D_CXXONLY },
  { "__is_base_of",             RID_IS_BASE_OF,    D_CXXONLY },
  { "__is_class",               RID_IS_CLASS,      D_CXXONLY },
  { "__is_empty",               RID_IS_EMPTY,      D_CXXONLY },
  { "__is_enum",                RID_IS_ENUM,       D_CXXONLY },
  { "__is_final",               RID_IS_FINAL,      D_CXXONLY },
  { "__is_layout_compatible",   RID_IS_LAYOUT_COMPATIBLE, D_CXXONLY },
  { "__is_literal_type",        RID_IS_LITERAL_TYPE, D_CXXONLY },
  { "__is_pointer_interconvertible_base_of",
    RID_IS_POINTER_INTERCONVERTIBLE_BASE_OF, D_CXXONLY },
  { "__is_pod",                 RID_IS_POD,        D_CXXONLY },
  { "__is_polymorphic",         RID_IS_POLYMORPHIC, D_CXXONLY },
  { "__is_same",                RID_IS_SAME_AS,    D_CXXONLY },
  { "__is_same_as",             RID_IS_SAME_AS,    D_CXXONLY },
  { "__is_standard_layout",     RID_IS_STD_LAYOUT, D_CXXONLY },
  { "__is_trivial",             RID_IS_TRIVIAL,    D_CXXONLY },
  { "__is_trivially_assignable", RID_IS_TRIVIALLY_ASSIGNABLE, D_CXXONLY },
  { "__is_trivially_constructible", RID_IS_TRIVIALLY_CONSTRUCTIBLE, D_CXXONLY },
  { "__is_trivially_copyable",  RID_IS_TRIVIALLY_COPYABLE, D_CXXONLY },
  { "__is_union",               RID_IS_UNION,      D_CXXONLY },
  { "__label__",                RID_LABEL,         0 },
  { "__null",                   RID_NULL,          0 },
  { "__real",                   RID_REALPART,      0 },
  { "__real__",                 RID_REALPART,      0 },
  { "__restrict",               RID_RESTRICT,      0 },
  { "__restrict__",             RID_RESTRICT,      0 },
  { "__signed",                 RID_SIGNED,        0 },
  { "__signed__",               RID_SIGNED,        0 },
  { "__thread",                 RID_THREAD,        0 },
  { "__transaction_atomic",     RID_TRANSACTION_ATOMIC, 0 },
  { "__transaction_relaxed",    RID_TRANSACTION_RELAXED, 0 },
  { "__transaction_cancel",     RID_TRANSACTION_CANCEL, 0 },
  { "__typeof",                 RID_TYPEOF,        0 },
  { "__typeof__",               RID_TYPEOF,        0 },
  { "__underlying_type",        RID_UNDERLYING_TYPE, D_CXXONLY },
  { "__volatile",               RID_VOLATILE,      0 },
  { "__volatile__",             RID_VOLATILE,      0 },
  { "__GIMPLE",                 RID_GIMPLE,        D_CONLY },
  { "__PHI",                    RID_PHI,           D_CONLY },
  { "__RTL",                    RID_RTL,           D_CONLY },
  { "alignas",                  RID_ALIGNAS,       D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "alignof",                  RID_ALIGNOF,       D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "asm",                      RID_ASM,           D_ASM },
  { "auto",                     RID_AUTO,          0 },
  { "bool",                     RID_BOOL,          D_CXXONLY | D_CXXWARN },
  { "break",                    RID_BREAK,         0 },
  { "case",                     RID_CASE,          0 },
  { "catch",                    RID_CATCH,         D_CXX_OBJC | D_CXXWARN },
  { "char",                     RID_CHAR,          0 },
  { "char8_t",                  RID_CHAR8,         D_CXX_CHAR8_T_FLAGS | D_CXXWARN },
  { "char16_t",                 RID_CHAR16,        D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "char32_t",                 RID_CHAR32,        D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "class",                    RID_CLASS,         D_CXX_OBJC | D_CXXWARN },
  { "const",                    RID_CONST,         0 },
  { "consteval",                RID_CONSTEVAL,     D_CXXONLY | D_CXX20 | D_CXXWARN },
  { "constexpr",                RID_CONSTEXPR,     D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "constinit",                RID_CONSTINIT,     D_CXXONLY | D_CXX20 | D_CXXWARN },
  { "const_cast",               RID_CONSTCAST,     D_CXXONLY | D_CXXWARN },
  { "continue",                 RID_CONTINUE,      0 },
  { "decltype",                 RID_DECLTYPE,      D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "default",                  RID_DEFAULT,       0 },
  { "delete",                   RID_DELETE,        D_CXXONLY | D_CXXWARN },
  { "do",                       RID_DO,            0 },
  { "double",                   RID_DOUBLE,        0 },
  { "dynamic_cast",             RID_DYNCAST,       D_CXXONLY | D_CXXWARN },
  { "else",                     RID_ELSE,          0 },
  { "enum",                     RID_ENUM,          0 },
  { "explicit",                 RID_EXPLICIT,      D_CXXONLY | D_CXXWARN },
  { "export",                   RID_EXPORT,        D_CXXONLY | D_CXXWARN },
  { "extern",                   RID_EXTERN,        0 },
  { "false",                    RID_FALSE,         D_CXXONLY | D_CXXWARN },
  { "float",                    RID_FLOAT,         0 },
  { "for",                      RID_FOR,           0 },
  { "friend",                   RID_FRIEND,        D_CXXONLY | D_CXXWARN },
  { "goto",                     RID_GOTO,          0 },
  { "if",                       RID_IF,            0 },
  { "inline",                   RID_INLINE,        D_EXT89 },
  { "int",                      RID_INT,           0 },
  { "long",                     RID_LONG,          0 },
  { "mutable",                  RID_MUTABLE,       D_CXXONLY | D_CXXWARN },
  { "namespace",                RID_NAMESPACE,     D_CXXONLY | D_CXXWARN },
  { "new",                      RID_NEW,           D_CXXONLY | D_CXXWARN },
  { "noexcept",                 RID_NOEXCEPT,      D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "nullptr",                  RID_NULLPTR,       D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "operator",                 RID_OPERATOR,      D_CXXONLY | D_CXXWARN },
  { "private",                  RID_PRIVATE,       D_CXX_OBJC | D_CXXWARN },
  { "protected",                RID_PROTECTED,     D_CXX_OBJC | D_CXXWARN },
  { "public",                   RID_PUBLIC,        D_CXX_OBJC | D_CXXWARN },
  { "register",                 RID_REGISTER,      0 },
  { "reinterpret_cast",         RID_REINTCAST,     D_CXXONLY | D_CXXWARN },
  { "restrict",                 RID_RESTRICT,      D_CONLY | D_C99 },
  { "return",                   RID_RETURN,        0 },
  { "short",                    RID_SHORT,         0 },
  { "signed",                   RID_SIGNED,        0 },
  { "sizeof",                   RID_SIZEOF,        0 },
  { "static",                   RID_STATIC,        0 },
  { "static_assert",            RID_STATIC_ASSERT, D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "static_cast",              RID_STATCAST,      D_CXXONLY | D_CXXWARN },
  { "struct",                   RID_STRUCT,        0 },
  { "switch",                   RID_SWITCH,        0 },
  { "template",                 RID_TEMPLATE,      D_CXXONLY | D_CXXWARN },
  { "this",                     RID_THIS,          D_CXXONLY | D_CXXWARN },
  { "thread_local",             RID_THREAD,        D_CXXONLY | D_CXX11 | D_CXXWARN },
  { "throw",                    RID_THROW,         D_CXX_OBJC | D_CXXWARN },
  { "true",                     RID_TRUE,          D_CXXONLY | D_CXXWARN },
  { "try",                      RID_TRY,           D_CXX_OBJC | D_CXXWARN },
  { "typedef",                  RID_TYPEDEF,       0 },
  { "typename",                 RID_TYPENAME,      D_CXXONLY | D_CXXWARN },
  { "typeid",                   RID_TYPEID,        D_CXXONLY | D_CXXWARN },
  { "typeof",                   RID_TYPEOF,        D_ASM | D_EXT },
  { "union",                    RID_UNION,         0 },
  { "unsigned",                 RID_UNSIGNED,      0 },
  { "using",                    RID_USING,         D_CXXONLY | D_CXXWARN },
  { "virtual",                  RID_VIRTUAL,       D_CXXONLY | D_CXXWARN },
  { "void",                     RID_VOID,          0 },
  { "volatile",                 RID_VOLATILE,      0 },
  { "wchar_t",                  RID_WCHAR,         D_CXXONLY },
  { "while",                    RID_WHILE,         0 },
  { "__is_assignable",          RID_IS_ASSIGNABLE, D_CXXONLY },
  { "__is_constructible",       RID_IS_CONSTRUCTIBLE, D_CXXONLY },
  { "__is_nothrow_assignable",  RID_IS_NOTHROW_ASSIGNABLE, D_CXXONLY },
  { "__is_nothrow_constructible", RID_IS_NOTHROW_CONSTRUCTIBLE, D_CXXONLY },

  /* Keywords of C++ transactional memory.  */
  { "synchronized",             RID_SYNCHRONIZED,  D_CXX_OBJC | D_TRANSMEM },
  { "atomic_noexcept",          RID_ATOMIC_NOEXCEPT, D_CXXONLY | D_TRANSMEM },
  { "atomic_cancel",            RID_ATOMIC_CANCEL, D_CXXONLY | D_TRANSMEM },
  { "atomic_commit",            RID_TRANSACTION_ATOMIC, D_CXXONLY | D_TRANSMEM },

  /* Keywords related to concepts.  */
  { "concept",                  RID_CONCEPT,       D_CXX_CONCEPTS_FLAGS | D_CXXWARN },
  { "requires",                 RID_REQUIRES,      D_CXX_CONCEPTS_FLAGS | D_CXXWARN },

  /* Keywords related to modules.  These are internal tokens created by the
     preprocessor that cannot be spelled in source (note the trailing
     space in each string).  */
  { "module ",                  RID__MODULE,       D_CXX_MODULES_FLAGS | D_CXXWARN },
  { "import ",                  RID__IMPORT,       D_CXX_MODULES_FLAGS | D_CXXWARN },
  { "export ",                  RID__EXPORT,       D_CXX_MODULES_FLAGS | D_CXXWARN },

  /* Keywords related to coroutines.  */
  { "co_await",                 RID_CO_AWAIT,      D_CXX_COROUTINES_FLAGS | D_CXXWARN },
  { "co_yield",                 RID_CO_YIELD,      D_CXX_COROUTINES_FLAGS | D_CXXWARN },
  { "co_return",                RID_CO_RETURN,     D_CXX_COROUTINES_FLAGS | D_CXXWARN },

  /* Objective-C keywords that are recognized only directly after an
     '@'.  */
  { "compatibility_alias",      RID_AT_ALIAS,      D_OBJC },
  { "defs",                     RID_AT_DEFS,       D_OBJC },
  { "encode",                   RID_AT_ENCODE,     D_OBJC },
  { "end",                      RID_AT_END,        D_OBJC },
  { "implementation",           RID_AT_IMPLEMENTATION, D_OBJC },
  { "interface",                RID_AT_INTERFACE,  D_OBJC },
  { "protocol",                 RID_AT_PROTOCOL,   D_OBJC },
  { "selector",                 RID_AT_SELECTOR,   D_OBJC },
  { "finally",                  RID_AT_FINALLY,    D_OBJC },
  { "optional",                 RID_AT_OPTIONAL,   D_OBJC },
  { "required",                 RID_AT_REQUIRED,   D_OBJC },
  { "property",                 RID_AT_PROPERTY,   D_OBJC },
  { "package",                  RID_AT_PACKAGE,    D_OBJC },
  { "synthesize",               RID_AT_SYNTHESIZE, D_OBJC },
  { "dynamic",                  RID_AT_DYNAMIC,    D_OBJC },
  /* Recognized only in protocol-qualifier context (see above).  */
  { "bycopy",                   RID_BYCOPY,        D_OBJC },
  { "byref",                    RID_BYREF,         D_OBJC },
  { "in",                       RID_IN,            D_OBJC },
  { "inout",                    RID_INOUT,         D_OBJC },
  { "oneway",                   RID_ONEWAY,        D_OBJC },
  { "out",                      RID_OUT,           D_OBJC },
  /* Recognized inside a property attribute list.  */
  { "assign",                   RID_ASSIGN,        D_OBJC },
  { "atomic",                   RID_PROPATOMIC,    D_OBJC },
  { "copy",                     RID_COPY,          D_OBJC },
  { "getter",                   RID_GETTER,        D_OBJC },
  { "nonatomic",                RID_NONATOMIC,     D_OBJC },
  { "readonly",                 RID_READONLY,      D_OBJC },
  { "readwrite",                RID_READWRITE,     D_OBJC },
  { "retain",                   RID_RETAIN,        D_OBJC },
  { "setter",                   RID_SETTER,        D_OBJC },
  /* Objective-C nullability keywords, accepted only in specific
     contexts.  */
  { "null_unspecified",         RID_NULL_UNSPECIFIED, D_OBJC },
  { "nullable",                 RID_NULLABLE,      D_OBJC },
  { "nonnull",                  RID_NONNULL,       D_OBJC },
  { "null_resettable",          RID_NULL_RESETTABLE, D_OBJC },
};

1.2.5. PRAGMA 类型 (c_token::pragma_kind)

当一个词法符号为 CPP_PRAGMA 时,表示该词法符号是编译制导标识,用来对编译进行制导。gcc 支持的所有 c 语言的编译制导符号以及符号值都被定义在了 ./gcc/c-family/c-pragma.h 文件的 enum pragma_kind 枚举变量中。

//# gcc-12.1.0/gcc/c-family/c-pragma.h	##------## enum pragma_kind

/* Pragma identifiers built in to the front end parsers.  Identifiers
   for ancillary handlers will follow these.

   Enumerators without an initializer take the previous value plus one,
   counting up from PRAGMA_NONE = 0.  PRAGMA_OMP__START_ and
   PRAGMA_OMP__LAST_ are aliases of existing enumerators and therefore
   do not consume a value of their own.  */
enum pragma_kind {
    
    
  PRAGMA_NONE = 0,

  /* PRAGMA_OACC_* group (presumably the OpenACC directives -- TODO
     confirm against c-pragma.cc).  */
  PRAGMA_OACC_ATOMIC,
  PRAGMA_OACC_CACHE,
  PRAGMA_OACC_DATA,
  PRAGMA_OACC_DECLARE,
  PRAGMA_OACC_ENTER_DATA,
  PRAGMA_OACC_EXIT_DATA,
  PRAGMA_OACC_HOST_DATA,
  PRAGMA_OACC_KERNELS,
  PRAGMA_OACC_LOOP,
  PRAGMA_OACC_PARALLEL,
  PRAGMA_OACC_ROUTINE,
  PRAGMA_OACC_SERIAL,
  PRAGMA_OACC_UPDATE,
  PRAGMA_OACC_WAIT,

  /* PRAGMA_OMP__START_ should be equal to the first PRAGMA_OMP_* code.  */
  PRAGMA_OMP_ALLOCATE,
  PRAGMA_OMP__START_ = PRAGMA_OMP_ALLOCATE,
  PRAGMA_OMP_ATOMIC,
  PRAGMA_OMP_BARRIER,
  PRAGMA_OMP_CANCEL,
  PRAGMA_OMP_CANCELLATION_POINT,
  PRAGMA_OMP_CRITICAL,
  PRAGMA_OMP_DECLARE,
  PRAGMA_OMP_DEPOBJ,
  PRAGMA_OMP_DISTRIBUTE,
  PRAGMA_OMP_ERROR,
  PRAGMA_OMP_END_DECLARE_TARGET,
  PRAGMA_OMP_FLUSH,
  PRAGMA_OMP_FOR,
  PRAGMA_OMP_LOOP,
  PRAGMA_OMP_NOTHING,
  PRAGMA_OMP_MASKED,
  PRAGMA_OMP_MASTER,
  PRAGMA_OMP_ORDERED,
  PRAGMA_OMP_PARALLEL,
  PRAGMA_OMP_REQUIRES,
  PRAGMA_OMP_SCAN,
  PRAGMA_OMP_SCOPE,
  PRAGMA_OMP_SECTION,
  PRAGMA_OMP_SECTIONS,
  PRAGMA_OMP_SIMD,
  PRAGMA_OMP_SINGLE,
  PRAGMA_OMP_TARGET,
  PRAGMA_OMP_TASK,
  PRAGMA_OMP_TASKGROUP,
  PRAGMA_OMP_TASKLOOP,
  PRAGMA_OMP_TASKWAIT,
  PRAGMA_OMP_TASKYIELD,
  PRAGMA_OMP_THREADPRIVATE,
  PRAGMA_OMP_TEAMS,
  /* PRAGMA_OMP__LAST_ should be equal to the last PRAGMA_OMP_* code.  */
  PRAGMA_OMP__LAST_ = PRAGMA_OMP_TEAMS,

  /* Remaining built-in pragmas.  c_parse_file compares the first token's
     pragma_kind against PRAGMA_GCC_PCH_PREPROCESS.  */
  PRAGMA_GCC_PCH_PREPROCESS,
  PRAGMA_IVDEP,
  PRAGMA_UNROLL,

  /* Marks the end of the built-in identifiers listed here; per the header
     comment, identifiers for ancillary handlers follow these.  */
  PRAGMA_FIRST_EXTERNAL
};

1.2.6. 树节点指针 (value)

对于一些词法符号,不仅需要关注其类型,还需要关注其值。如,字符串常量符号,其词法符号类型为 CPP_STRING,而该字符串常量的值由 value 指向的字符串常量树节点给出。

1.2.7. 位置信息 (location)

location 用于描述该词法单元在源码中的位置,可以使调试人员快速地定位到错误位置。gcc 在 ./libcpp/include/line-map.h 文件中定义了描述符号位置的结构体 expanded_location,并在 ./gcc/input.h 中声明了获取该位置信息的函数 extern expanded_location expand_location (location_t);

//# gcc-12.1.0/libcpp/include/line-map.h	##------## expanded_location
/* Expanded form of a source location: file, line and column plus two
   extra fields.  expand_location (location_t) returns one of these by
   value.  */
typedef struct
{
    
    
  /* The name of the source file involved.  */
  const char *file;

  /* The line-location in the source file.  */
  int line;

  /* Column of the location (NOTE(review): the numbering base is not
     visible in this excerpt -- confirm in line-map.h).  */
  int column;

  /* Untyped pointer; its meaning is not shown in this excerpt.  */
  void *data;

  /* In a system header?. */
  bool sysp;
} expanded_location;

//# gcc-12.1.0/gcc/input.h
extern expanded_location expand_location (location_t);

1.2.8. 示例

下面给出一个简单的词法分析器处理例子。
词法分析

1.3. 语法分析&语义分析

事实上语法分析是与词法分析伴随进行,当语法分析器需要新的词法单元 token 时,会从词法分析器获得新的词法单元 token 序列,确认该序列可由文法生成,若语法无误,则生成语法分析树节点,最终生成完整的语法树,即抽象语法树(Abstract Syntax Tree,AST),gcc生成语法分析树以及之后到汇编代码转换的过程下文会继续详述,这里先提到一点。gcc中对语法分析主要由gcc-12.1.0/gcc/c/c-parser.cc该文件中的各函数完成,其中struct c_parser结构体用于保存语法分析状态以及上下文信息、当前预读的词法符号(最多两个)等,以及语法分析的入口函数c_parse_file()。

c_parser结构体的具体信息如下:

//# gcc-12.1.0/gcc/c/c-parser.cc	##------## struct c_parser

/* A parser structure recording information about the state and
   context of parsing.  Includes lexer information with up to two
   tokens of look-ahead; more are not needed for C.

   NOTE(review): the summary above mentions two tokens, while
   tokens_buf below is declared with four entries and tokens_avail is
   documented as 0 - 4.  Every flag is a one-bit BOOL_BITFIELD;
   in_transaction is the only multi-bit field (4 bits).  */
struct GTY(()) c_parser {
    
    
  /* The look-ahead tokens.  */
  c_token * GTY((skip)) tokens;
  /* Buffer for look-ahead tokens.  */
  c_token tokens_buf[4];
  /* How many look-ahead tokens are available (0 - 4, or
     more if parsing from pre-lexed tokens).  */
  unsigned int tokens_avail;
  /* Raw look-ahead tokens, used only for checking in Objective-C
     whether '[[' starts attributes.  */
  vec<c_token, va_gc> *raw_tokens;
  /* The number of raw look-ahead tokens that have since been fully
     lexed.  */
  unsigned int raw_tokens_used;
  /* True if a syntax error is being recovered from; false otherwise.
     c_parser_error sets this flag.  It should clear this flag when
     enough tokens have been consumed to recover from the error.  */
  BOOL_BITFIELD error : 1;
  /* True if we're processing a pragma, and shouldn't automatically
     consume CPP_PRAGMA_EOL.  */
  BOOL_BITFIELD in_pragma : 1;
  /* True if we're parsing the outermost block of an if statement.  */
  BOOL_BITFIELD in_if_block : 1;
  /* True if we want to lex a translated, joined string (for an
     initial #pragma pch_preprocess).  Otherwise the parser is
     responsible for concatenating strings and translating to the
     execution character set as needed.  */
  BOOL_BITFIELD lex_joined_string : 1;
  /* True if, when the parser is concatenating string literals, it
     should translate them to the execution character set (false
     inside attributes).  */
  BOOL_BITFIELD translate_strings_p : 1;

  /* Objective-C specific parser/lexer information.  */

  /* True if we are in a context where the Objective-C "PQ" keywords
     are considered keywords.  */
  BOOL_BITFIELD objc_pq_context : 1;
  /* True if we are parsing a (potential) Objective-C foreach
     statement.  This is set to true after we parsed 'for (' and while
     we wait for 'in' or ';' to decide if it's a standard C for loop or an
     Objective-C foreach loop.  */
  BOOL_BITFIELD objc_could_be_foreach_context : 1;
  /* The following flag is needed to contextualize Objective-C lexical
     analysis.  In some cases (e.g., 'int NSObject;'), it is
     undesirable to bind an identifier to an Objective-C class, even
     if a class with that name exists.  */
  BOOL_BITFIELD objc_need_raw_identifier : 1;
  /* Nonzero if we're processing a __transaction statement.  The value
     is 1 | TM_STMT_ATTR_*.  */
  unsigned int in_transaction : 4;
  /* True if we are in a context where the Objective-C "Property attribute"
     keywords are valid.  */
  BOOL_BITFIELD objc_property_attr_context : 1;

  /* Whether we have just seen/constructed a string-literal.  Set when
     returning a string-literal from c_parser_string_literal.  Reset
     in consume_token.  Useful when we get a parse error and see an
     unknown token, which could have been a string-literal constant
     macro.  */
  BOOL_BITFIELD seen_string_literal : 1;

  /* Location of the last consumed token.  */
  location_t last_token_location;
};

语法分析的入口函数c_parse_file()具体如下:

//# gcc-12.1.0/gcc/c/c-parser.cc	##------## c_parse_file()

/* Entry point of the C parser.  Sets up a temporary parser on the stack,
   handles a leading #pragma GCC pch_preprocess if present, copies the
   parser state into a ggc_alloc'ed c_parser, parses the translation
   unit and finally resets the_parser to NULL.  Takes no arguments and
   returns nothing.  */
void
c_parse_file (void)
{
    
    
  /* Use local storage to begin.  If the first token is a pragma, parse it.
     If it is #pragma GCC pch_preprocess, then this will load a PCH file
     which will cause garbage collection.  */
  c_parser tparser;

  memset (&tparser, 0, sizeof tparser);
  tparser.translate_strings_p = true;
  tparser.tokens = &tparser.tokens_buf[0];
  the_parser = &tparser;

  /* Peek at the first lexical token.  If it is the
     #pragma GCC pch_preprocess token, process that pragma; otherwise
     call c_common_no_more_pch ().  */
  if (c_parser_peek_token (&tparser)->pragma_kind == PRAGMA_GCC_PCH_PREPROCESS)
    c_parser_pragma_pch_preprocess (&tparser);	/* handle the PCH pragma */
  else
    c_common_no_more_pch ();

  the_parser = ggc_alloc<c_parser> ();		/* allocate the c_parser via ggc_alloc */
  *the_parser = tparser;
  /* The struct copy also copied the tokens pointer; if it still pointed
     at tparser's own buffer, redirect it to the buffer inside the copy.  */
  if (tparser.tokens == &tparser.tokens_buf[0])
    the_parser->tokens = &the_parser->tokens_buf[0];

  /* Initialize EH, if we've been told to do so.  */
  if (flag_exceptions)
    using_eh_for_cleanups ();

  c_parser_translation_unit (the_parser);	/* start parsing at the translation-unit nonterminal of the C grammar */
  the_parser = NULL;
}

1.3.1. 如何输出AST(Abstract Syntax Tree)

这里给出一段极其简单的代码,大家一眼可以看出其中result将会被重新赋值为12

imaginemiracle@ubuntu:abstract_syntax_tree$ cat ast.c
int main(int argc, char **argv)    /* argc and argv are never read in the body */
{
    
    
    int a = 10;
    int b = 20;

    int result = 0;

    result = a + b / a;    /* the division binds tighter: 10 + 20 / 10 = 12 */

    return 0;
}

首先使用 gcc 工具生成 AST 的 dump 文件,即下面的 ast.c.004t.original 文件,执行命令如下:

imaginemiracle@ubuntu:abstract_syntax_tree$ gcc -fdump-tree-original-raw ast.c 
imaginemiracle@ubuntu:abstract_syntax_tree$ ls
a.out  ast.c  ast.c.004t.original

事实上到这里生成的文件已经是真正由语法分析器解析到的抽象语法树文件了,只不过是我们读起来不够方便,后面的步骤只是为了生成我们更容易理解的图示结果,我们先查看该文件里的内容:

imaginemiracle@ubuntu:abstract_syntax_tree$ cat ast.c.004t.original 

;; Function main (null)
;; enabled by -tree-original

@1      statement_list   0   : @2       1   : @3      
@2      bind_expr        type: @4       vars: @5       body: @6      
@3      return_expr      type: @4       expr: @7      
@4      void_type        name: @8       algn: 8       
@5      var_decl         name: @9       type: @10      scpe: @11     
                         srcp: ast.c:3                 init: @12     
                         size: @13      algn: 32       used: 1       
@6      statement_list   0   : @14      1   : @15      2   : @16     
                         3   : @17      4   : @18     
@7      modify_expr      type: @10      op 0: @19      op 1: @20     
@8      type_decl        name: @21      type: @4      
@9      identifier_node  strg: a        lngt: 1       
@10     integer_type     name: @22      size: @13      algn: 32      
                         prec: 32       sign: signed   min : @23     
                         max : @24     
@11     function_decl    name: @25      type: @26      srcp: ast.c:1      
                         args: @27      link: extern  
@12     integer_cst      type: @10     int: 10
@13     integer_cst      type: @28     int: 32
@14     decl_expr        type: @4      
@15     decl_expr        type: @4      
@16     decl_expr        type: @4      
@17     modify_expr      type: @10      op 0: @29      op 1: @30     
@18     return_expr      type: @4       expr: @31     
@19     result_decl      type: @10      scpe: @11      srcp: ast.c:1      
                         note: artificial              size: @13     
                         algn: 32      
@20     integer_cst      type: @10     int: 0
@21     identifier_node  strg: void     lngt: 4       
@22     type_decl        name: @32      type: @10     
@23     integer_cst      type: @10     int: -2147483648
@24     integer_cst      type: @10     int: 2147483647
@25     identifier_node  strg: main     lngt: 4       
@26     function_type    size: @33      algn: 8        retn: @10     
                         prms: @34     
@27     parm_decl        name: @35      type: @10      scpe: @11     
                         srcp: ast.c:1                 argt: @10     
                         size: @13      algn: 32       used: 0       
@28     integer_type     name: @36      size: @37      algn: 128     
                         prec: 128      sign: unsigned min : @38     
                         max : @39     
@29     var_decl         name: @40      type: @10      scpe: @11     
                         srcp: ast.c:6                 init: @20     
                         size: @13      algn: 32       used: 1       
@30     plus_expr        type: @10      op 0: @41      op 1: @5      
@31     modify_expr      type: @10      op 0: @19      op 1: @20     
@32     identifier_node  strg: int      lngt: 3       
@33     integer_cst      type: @28     int: 8
@34     tree_list        valu: @10      chan: @42     
@35     identifier_node  strg: argc     lngt: 4       
@36     identifier_node  strg: bitsizetype             lngt: 11      
@37     integer_cst      type: @28     int: 128
@38     integer_cst      type: @28     int: 0
@39     integer_cst      type: @28     int: -1
@40     identifier_node  strg: result   lngt: 6       
@41     trunc_div_expr   type: @10      op 0: @43      op 1: @5      
@42     tree_list        valu: @44      chan: @45     
@43     var_decl         name: @46      type: @10      scpe: @11     
                         srcp: ast.c:4                 init: @47     
                         size: @13      algn: 32       used: 1       
@44     pointer_type     size: @48      algn: 64       ptd : @49     
@45     tree_list        valu: @4      
@46     identifier_node  strg: b        lngt: 1       
@47     integer_cst      type: @10     int: 20
@48     integer_cst      type: @28     int: 64
@49     pointer_type     size: @48      algn: 64       ptd : @50     
@50     integer_type     name: @51      size: @33      algn: 8       
                         prec: 8        sign: signed   min : @52     
                         max : @53     
@51     type_decl        name: @54      type: @50     
@52     integer_cst      type: @50     int: -128
@53     integer_cst      type: @50     int: 127
@54     identifier_node  strg: char     lngt: 4

下面准备两个脚本文件:
pre.awk 文件,内容如下:

imaginemiracle@ubuntu:abstract_syntax_tree$ cat pre.awk
#! /usr/bin/gawk -f
# pre.awk -- normalise a GCC raw tree dump before it is piped to graphviz.awk.
#
# Only lines whose first character exists and is not ';' are processed.
# On those lines a leading '@' is turned into '~@' (graphviz.awk uses "~@"
# as its record separator) and the blanks around every ':' are removed,
# so "type: @4" becomes "type:@4".  The edited line is then printed.
/^[^;]/ {
    sub(/^@/, "~@")
    gsub(/( *):( *)/, ":")
    print $0
}

graphviz.awk 文件,内容如下:

imaginemiracle@ubuntu:abstract_syntax_tree$ cat graphviz.awk
#! /usr/bin/gawk -f
# graphviz.awk -- turn the output of pre.awk into a Graphviz "digraph".
#
# Records are separated by "~@", so every record is one node of the dump:
# $1 is the node number and the remaining fields are the node name and its
# "key:value" items.  Needs gawk, because gensub() is a gawk extension.

BEGIN {
    RS = "~@"
    # Keep this literal on one physical line: an awk string cannot contain
    # a raw newline, only the "\n" escape.
    printf "digraph G {\n node [shape = record];"
}

/^[0-9]/ {
    # Build the record-shaped label: "<id> [label = "{<id> | {f2 | f3 ...}}"];"
    s = sprintf("%s [label = \"{%s | {", $1, $1)
    for (i = 2; i < NF - 1; i++)
        s = s sprintf("%s | ", $i)
    # NOTE(review): after the loop i is max(2, NF - 1), so whenever NF > 2
    # the last field $NF is not written to the label -- confirm that this
    # is intended.
    s = s sprintf("%s}}\"];\n", $i)
    $0 = s

    # While a "key:@N" reference (alphabetic key) remains, replace it by
    # the bare key and append an edge line "<id>:key -> N;".
    while (/([a-zA-Z]+):@([0-9]+)/) {
        format = sprintf("\\1 \\3\n %s:\\1 -> \\2;", $1)
        $0 = gensub(/([a-zA-Z]+):@([0-9]+)(.*)$/, format, "g")
    }
    printf " %s\n", $0
}

END {
    print "}"
}

再写一个脚本 run.sh 来调用这两个脚本 pre.awk 和 graphviz.awk,其内容如下:

imaginemiracle@ubuntu:abstract_syntax_tree$ cat run.sh 
# Usage: run.sh <source-file>, e.g. "bash run.sh ast.c".
# Read only the "<source>.<NNN>t.original" dump: the wider "$1.*" glob also
# matches the "$1.dot" file written by an earlier run and would feed it
# back into the pipeline.  Quoting keeps file names with blanks intact.
./pre.awk "$1".*.original | ./graphviz.awk > "$1.dot"

文件准备好后需要给这几个脚本赋予可执行权限,命令如下:

imaginemiracle@ubuntu:abstract_syntax_tree$ chmod 755 *.awk
imaginemiracle@ubuntu:abstract_syntax_tree$ chmod 755 run.sh

为避免有的小伙伴没有安装过 gawk 和 graphviz 导致报错无法执行脚本的慌乱,下面提供两个软件包的安装命令:

imaginemiracle@ubuntu:abstract_syntax_tree$ sudo apt install gawk
imaginemiracle@ubuntu:abstract_syntax_tree$ sudo apt-get install graphviz

开始执行 run.sh 脚本,将会生成 *.dot 文件,这里是 ast.c.dot

imaginemiracle@ubuntu:abstract_syntax_tree$ bash run.sh ast.c
imaginemiracle@ubuntu:abstract_syntax_tree$ ls
a.out  ast.c  ast.c.004t.original  ast.c.dot  graphviz.awk  pre.awk  run.sh

接着使用 dot 工具将其导出为 PNG 图片或 PDF 格式文件:

imaginemiracle@ubuntu:abstract_syntax_tree$ dot -Tpng ast.c.dot -o ast.png
imaginemiracle@ubuntu:abstract_syntax_tree$ ls
a.out  ast.c  ast.c.004t.original  ast.c.dot  ast.png  graphviz.awk  pre.awk  run.sh

如下即为对应上面源码导出的抽象语法树(AST)图,可以对比 ast.c.004t.original 文件内容,会发现里面的内容被用更加清晰的框图和箭头的形式描述了出来,使人们更容易解读。虽然源码很简单,如果“没有基础”并且“没有做好准备”的小伙伴们看到与其对应的抽象语法树是不是还是眼花缭乱的。
抽象语法树
没关系,下面笔者就尝试和大家一起学习分析一下这让人眼花缭乱的抽象语法树(AST)。我们类比着 ast.c.004t.original 文件和上面这张语法树图同时分析,这样子我们就两个都会看明白了。

1.3.2. 简单解读AST(Abstract Syntax Tree)

若需要搞清楚抽象语法树里的各个节点也就是上图中的每个框代表什么意思,就不得不从 gcc 的源码入手了,gcc 中使用一个名为 tree_node 的共用体(也称联合体,union)来描述 AST 的每个节点,其实这样做是为了对 AST 实现一个通用的描述名称而已,而在 tree_node 内部添加的各个结构体作为实际的特性存储结构,tree_node 是对这所有的特性存储结构的一个统称。我们来看看在 gcc 中定义的抽象语法树(AST)里都包含哪些代表不同含义的节点。
[注]:一时没有找到 tree_node 定义位置的读者也不要慌张。新版本的 union tree_node 的定义被写在 ./gcc/tree-core.h中,而旧版本的 tree_node 的定义写在 ./gcc/tree.h里。

//# gcc-12.1.0/gcc/tree-core.h	##------## union tree_node

/* Define the overall contents of a tree node.
   It may be any of the structures declared above
   for various types of node.

   Each member carries a GTY tag naming a TS_* value; the desc
   annotation names tree_node_structure (&%h) as the expression that
   selects among them.  The short per-member notes below are the
   article author's annotations, translated to English.  */
union GTY ((ptr_alias (union lang_tree_node),
	    desc ("tree_node_structure (&%h)"), variable_size)) tree_node {
    
    
  /* Base structure of tree nodes.  */
  struct tree_base GTY ((tag ("TS_BASE"))) base;
  /* Typed node.  */
  struct tree_typed GTY ((tag ("TS_TYPED"))) typed;
  /* Basic information shared by tree nodes.  */
  struct tree_common GTY ((tag ("TS_COMMON"))) common;
  /* Integer constant node.  */
  struct tree_int_cst GTY ((tag ("TS_INT_CST"))) int_cst;
  struct tree_poly_int_cst GTY ((tag ("TS_POLY_INT_CST"))) poly_int_cst;
  /* Real constant node.  */
  struct tree_real_cst GTY ((tag ("TS_REAL_CST"))) real_cst;
  /* Fixed-point constant node.  */
  struct tree_fixed_cst GTY ((tag ("TS_FIXED_CST"))) fixed_cst;
  /* Vector constant node.  */
  struct tree_vector GTY ((tag ("TS_VECTOR"))) vector;
  /* String constant node.  */
  struct tree_string GTY ((tag ("TS_STRING"))) string;
  /* Complex constant node.  */
  struct tree_complex GTY ((tag ("TS_COMPLEX"))) complex;
  /* Identifier node.  */
  struct tree_identifier GTY ((tag ("TS_IDENTIFIER"))) identifier;
  /* Base of the declaration nodes.  */
  struct tree_decl_minimal GTY((tag ("TS_DECL_MINIMAL"))) decl_minimal;
  struct tree_decl_common GTY ((tag ("TS_DECL_COMMON"))) decl_common;
  /* Declaration that has RTL attached.  */
  struct tree_decl_with_rtl GTY ((tag ("TS_DECL_WRTL"))) decl_with_rtl;
  /* Base of the "non-common" declarations.  */
  struct tree_decl_non_common  GTY ((tag ("TS_DECL_NON_COMMON")))
    decl_non_common;
  /* Base of parameter declarations.  */
  struct tree_parm_decl  GTY  ((tag ("TS_PARM_DECL"))) parm_decl;
  /* Base of declarations that have visibility.  */
  struct tree_decl_with_vis GTY ((tag ("TS_DECL_WITH_VIS"))) decl_with_vis;
  /* Variable declaration.  */
  struct tree_var_decl GTY ((tag ("TS_VAR_DECL"))) var_decl;
  /* Field declaration.  */
  struct tree_field_decl GTY ((tag ("TS_FIELD_DECL"))) field_decl;
  /* Label declaration node.  */
  struct tree_label_decl GTY ((tag ("TS_LABEL_DECL"))) label_decl;
  /* Result (return value) declaration node.  */
  struct tree_result_decl GTY ((tag ("TS_RESULT_DECL"))) result_decl;
  /* Constant declaration node.  */
  struct tree_const_decl GTY ((tag ("TS_CONST_DECL"))) const_decl;
  /* Type declaration node.  */
  struct tree_type_decl GTY ((tag ("TS_TYPE_DECL"))) type_decl;
  /* Function declaration node.  */
  struct tree_function_decl GTY ((tag ("TS_FUNCTION_DECL"))) function_decl;
  /* Translation unit declaration node.  */
  struct tree_translation_unit_decl GTY ((tag ("TS_TRANSLATION_UNIT_DECL")))
    translation_unit_decl;
  struct tree_type_common GTY ((tag ("TS_TYPE_COMMON"))) type_common;
  struct tree_type_with_lang_specific GTY ((tag ("TS_TYPE_WITH_LANG_SPECIFIC")))
    type_with_lang_specific;
  struct tree_type_non_common GTY ((tag ("TS_TYPE_NON_COMMON")))
    type_non_common;
  /* List node.  */
  struct tree_list GTY ((tag ("TS_LIST"))) list;
  /* Vec node.  */
  struct tree_vec GTY ((tag ("TS_VEC"))) vec;
  /* Expression node.  */
  struct tree_exp GTY ((tag ("TS_EXP"))) exp;
  /* SSA_NAME (static single assignment) node.  */
  struct tree_ssa_name GTY ((tag ("TS_SSA_NAME"))) ssa_name;
  /* Block information node.  */
  struct tree_block GTY ((tag ("TS_BLOCK"))) block;
  /* BINFO node.  NOTE(review): the original annotation repeated "block
     information node" here; BINFO presumably records base-class
     information of a type -- confirm in tree-core.h.  */
  struct tree_binfo GTY ((tag ("TS_BINFO"))) binfo;
  /* Statement list node.  */
  struct tree_statement_list GTY ((tag ("TS_STATEMENT_LIST"))) stmt_list;
  struct tree_constructor GTY ((tag ("TS_CONSTRUCTOR"))) constructor;
  struct tree_omp_clause GTY ((tag ("TS_OMP_CLAUSE"))) omp_clause;
  struct tree_optimization_option GTY ((tag ("TS_OPTIMIZATION"))) optimization;
  struct tree_target_option GTY ((tag ("TS_TARGET_OPTION"))) target_option;
};

上面笔者给出了部分的节点含义的注释,我们把本次分析所需要用到的几个节点列在下面,可以更直观的看到。

tree_node中的结构体类型 结构体名 描述
struct tree_identifier GTY ((tag (“TS_IDENTIFIER”))) identifier 标识符节点
struct tree_function_decl GTY ((tag (“TS_FUNCTION_DECL”))) function_decl 函数声明节点
struct tree_var_decl GTY ((tag (“TS_VAR_DECL”))) var_decl 变量声明节点
struct tree_exp GTY ((tag (“TS_EXP”))) exp 表达式节点
struct tree_result_decl GTY ((tag (“TS_RESULT_DECL”))) result_decl 返回值节点

首先回到 ast.c 源码:

int main(int argc, char **argv)    /* argc and argv are never read in the body */
{
    
    
    int a = 10;
    int b = 20;

    int result = 0;

    result = a + b / a;    /* the division binds tighter: 10 + 20 / 10 = 12 */

    return 0;
}

源码很简单,只有一个 main 函数,其中定义了三个 int 型的变量 a、b、result 并赋了初值,以及一行运算表达式赋值语句 result = a + b / a。从上表中可以看出,源码这里的 a、b、result 定义以及表达式 result = a + b / a 在抽象语法树中主要用到的树节点是保存变量声明的节点 struct tree_var_decl、用于保存表达式的节点 struct tree_exp,清楚这些,就可以去看通过 gcc -fdump-tree-original-raw 生成的对应抽象语法树 ast.c.004t.original。

1.3.2.1. AST节点结构

首先我们需要清楚抽象语法树文件 ast.c.004t.original 中一行里每个字段都代表什么意思,每个节点由最基本的三个单元组成,一个节点(一行,由于长度的关系也可能是几行)的基本信息如下。

字段 对应的特性存储结构 描述
标号 (TREE_CODE) enum tree_code 树节点标识 TREE_CODE 的枚举
名称 (NAME) const char * const tree_code_name[] 树节点名称的字符串数组
类型 (TREE_CODE_CLASS) enum tree_code_class
const char * const tree_code_class_strings[]
const enum tree_code_class tree_code_type[]
树节点类型的枚举
树节点类型名称的字符串数组
以 TREE_CODE 为索引的树节点类型数组
操作数长度 (len) const unsigned char tree_code_length[] 树节点操作数的数目

1.3.2.2. AST 的各个节点分析

@1 节点
清楚了以上内容,现在从 ast.c.004t.original
文件的第一个节点来分析它。

@1      statement_list   0   : @2       1   : @3
# 与该节点对应的存储结构为:
# struct tree_statement_list GTY ((tag ("TS_STATEMENT_LIST"))) stmt_list;

可以看到该节点的标号 TREE_CODE 为 @1,也就是说它是第 1 号节点,节点的名称 NAME 为 statement_list,由此可以看出该节点对应的语法树节点的特性存储结构是 tree_node.tree_statement_list,该节点描述的是一段语句列表,看得出该语句列表中有两个语句 0 语句和 1 语句,0 语句指向了 @2 节点,而 1 语句指向了 @3 节点。其图示结构如下。
在这里插入图片描述

@2 节点

@2      bind_expr        type: @4       vars: @5       body: @6
# 与该节点对应的存储结构为:
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;

先来看 @1 节点描述的语句列表中的 0 语句,可以看到该节点的标号 TREE_CODE 为 @2,节点名称 NAME 为 bind_expr,由此可以看出该节点描述的是一个表达式,其对应的特性存储结构是 tree_node.tree_exp,该结构体可以描述更多的表达式类型,可以详参源码目录下的 ./gcc/tree.def 文件。
首先我们已经明确了该节点描述的是一个表达式,type 字段描述的是该表达式的取值类型,type 指向了 @4 节点,而后面几个描述的是该表达式的操作数,可以看得出一共有两个操作数,分别为 vars 指向了 @5 节点和 body 指向了 @6 节点。其图示结构如下。

在这里插入图片描述

我们可以通过查看 ./gcc/tree.def 文件中 bind_expr 节点的操作数的类型:BIND_EXPR

  • BIND_EXPR_VARS: 指向描述变量的节点;
  • BIND_EXPR_BODY: 指向用于计算该变量的表达式函数主体;
  • BIND_EXPR_BLOCK: 该块操作数一般用于调试,这里不做分析。

看到这里我们已经可以将该节点的内容对应起来了。

节点字段 指向的节点 描述
type @4 该表达式的取值类型
vars @5 表达式的操作数,变量(BIND_EXPR_VARS)
body @6 表达式的主体(BIND_EXPR_BODY)

@4 节点
接下来对该节点的单元一个个的分析,先看 @4 节点,

@4      void_type        name: @8       algn: 8
# 与该节点对应的存储结构为:
# struct tree_typed GTY ((tag ("TS_TYPED"))) typed;

该节点名称 NAME 为 void_type,可以看得出该节点描述的是一个类型(其实已经从名字就看得出描述的是 void 类型了),下面对其简单分析一下。

节点字段 指向的节点 描述
name @8 该类型的名称
algn - 一般该字段指向整型常量节点,此处该值为 8,表示精度为 8bit,按 8bit 对齐

@8 节点 & @21 节点

@8      type_decl        name: @21      type: @4
# 与该节点对应的存储结构为:
# struct tree_type_decl GTY ((tag ("TS_TYPE_DECL"))) type_decl;

再来看 @8 节点,NAME 为 type_decl,该节点为一个声明类型的节点,其 name 字段表示该类型的名称,指向 @21 节点,type 字段指向该类型的类型节点,可以看到又指回了 @4 节点。下面看 @21 节点,可以直观的看到该节点就是一个标识符节点,该标识符的字符名称为 void。其图示结构如下。

# 标识符节点
@21     identifier_node  strg: void     lngt: 4
# 与该节点对应的存储结构为:
# struct tree_identifier GTY ((tag ("TS_IDENTIFIER"))) identifier;

小知识: 这里对标识符节点做一点解释。

identifier_node 字段 描述
strg 以字符串(string)形式表示标识符名
lngt 标识符长度(length)

在这里插入图片描述

@5 节点
接着回头分析 @2 节点的下一个字段也就是其第一个操作数 vars: @5

@5      var_decl         name: @9       type: @10      scpe: @11     
                         srcp: ast.c:3                 init: @12     
                         size: @13      algn: 32       used: 1
# 与该节点对应的存储结构为:                         
# struct tree_var_decl GTY ((tag ("TS_VAR_DECL"))) var_decl;                         

小知识: 这里给出对此处的第 @5 节点即 var_decl 节点详细分析的结果,有兴趣的小伙伴可以根据抽象语法树分析,也可以得出该结果。

var_decl字段 描述 @5 节点的值
name 该节点描述的变量名称 a
type 该节点描述的变量类型 int
scpe 所属范围 (scope) main() 函数作用域
srcp 源程序(source program) 在源码 ast.c 的第 3 行
init 初始值(initial) 初始值为 10
size 变量大小 32bit
algn 对齐位数(align) 32bit 对齐
used 引用次数 1

从节点名 var_decl 知道该节点描述的是对一个变量的声明,虽然该节点有很多字段,事实上我们分析一个变量的声明时,常常关心的只有三点内容,即变量的变量名、类型以及初始值,也就是该节点中的 name、type、init 这三个字段的内容。
@10 节点
按照常规习惯,我们先来看该变量的类型 type: @10,由于对类型的分析在上文已经有做过详细的跟踪查看,此处不做太过详细的解释,分析方法同上即可,下面列出与之相关的几个关键节点信息。

# 类型节点——声明整型类型节点
@10     integer_type     name: @22      size: @13      algn: 32      
                         prec: 32       sign: signed   min : @23     
                         max : @24
# 与该节点对应的存储结构为:
# struct tree_typed GTY ((tag ("TS_TYPED"))) typed;
@22     type_decl        name: @32      type: @10  
# 与该节点对应的存储结构为:
# struct tree_type_decl GTY ((tag ("TS_TYPE_DECL"))) type_decl;   
@32     identifier_node  strg: int      lngt: 3    
# 与该节点对应的存储结构为:
# struct tree_identifier GTY ((tag ("TS_IDENTIFIER"))) identifier;              

小知识: 这里给出对此处的第 @10 节点即 integer_type 节点详细分析的结果,有兴趣的小伙伴可以根据抽象语法树分析,也可以得出该结果。

integer_type字段 描述 @10 节点的值
name 该节点描述的名称 int
size 该节点描述的类型的大小 32bit
algn 对齐位数(align) 32bit 对齐
prec 该类型的精度 该类型为 32bit 精度
sign 有无符号 signed 有符号类型
min 最小值 -2^31 = -2147483648
max 最大值 2^31 - 1 = 2147483647

从上面几个节点可以看出来该 @5 节点的变量为 int 型变量。下面来看该变量的变量名 name: @9,可以很直观的看出该变量名应为 a

# 标识符节点
@9      identifier_node  strg: a        lngt: 1
# 与该节点对应的存储结构为:
# struct tree_identifier GTY ((tag ("TS_IDENTIFIER"))) identifier;

来看该变量的初始值 init: @12,即第 12 节点。可以很清楚的看到该节点为整型常量的节点 integer_cst,其值为 int: 10。其图示结构如下。

# 整型常量节点
@12     integer_cst      type: @10     int: 10
# 与该节点对应的存储结构为:
# struct tree_int_cst GTY ((tag ("TS_INT_CST"))) int_cst;

在这里插入图片描述

到这里我们就清楚了 @5 节点描述的是一个类型为 int 型,名为 a,初始值为 10的整型变量,也就是源码中的 int a = 10;

# C语言定义
int a = 10;
# ===========================分隔符===========================
# 抽象语法树对其的描述
@5      var_decl         name: @9       type: @10      scpe: @11     
                         srcp: ast.c:3                 init: @12     
                         size: @13      algn: 32       used: 1
@9      identifier_node  strg: a        lngt: 1       
@10     integer_type     name: @22      size: @13      algn: 32      
                         prec: 32       sign: signed   min : @23     
                         max : @24     
@11     function_decl    name: @25      type: @26      srcp: ast.c:1      
                         args: @27      link: extern  
@12     integer_cst      type: @10     int: 10
@13     integer_cst      type: @28     int: 32

@22     type_decl        name: @32      type: @10 

@32     identifier_node  strg: int      lngt: 3

到这里我们分析的内容,笔者在抽象语法树的图中摘了出来,可以直观的梳理分析过的思路。
int a = 10;

@6 节点
继续来分析 @2 节点的下一个操作数 body: @6,可以看得出 @6 节点是描述一段语句的节点,其语句包含节点 0: @14、1: @15、2: @16、3: @17、4: @18。

@6      statement_list   0   : @14      1   : @15      2   : @16     
                         3   : @17      4   : @18
# 与该节点对应的存储结构为:
# struct tree_statement_list GTY ((tag ("TS_STATEMENT_LIST"))) stmt_list;                         

从 ast.c.004t.original 文件里可以看到节点 @14、@15、@16 这三个节点是同一类型节点,且类型指向也相同 @4,三个节点均是描述的是一个空的表达式。因此这里这三个节点是没什么实际意义的。

@14     decl_expr        type: @4      
@15     decl_expr        type: @4      
@16     decl_expr        type: @4

@17 节点

@17     modify_expr      type: @10      op 0: @29      op 1: @30
# 与该节点对应的存储结构为:
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;

从该节点名 modify_expr 可以看出该节点描述的是一段赋值表达式,type 字段表示该表达式的取值类型指向节点 @10,由于上文已经对其类似的分析过了,这里就不再赘述,该表达式的取值类型为 int,其它字段含义可以在 ./gcc/tree.def 文件中索引找到,如下。
modify_expr
在该文件中指出,MODIFY_EXPR 表达式有两个操作数,其中 Operand 0 是需要被赋值的操作数,Operand 1 就是将要赋值给 Operand 0 的具体值。用 c 语言表达就相当于是 op_0 = op_1;,这样子的一条简单的赋值语句。图示结构如下。
在这里插入图片描述

@29 节点

@29     var_decl         name: @40      type: @10      scpe: @11     
                         srcp: ast.c:6                 init: @20     
                         size: @13      algn: 32       used: 1 
# 与该节点对应的存储结构为:
# struct tree_var_decl GTY ((tag ("TS_VAR_DECL"))) var_decl;                          

那么首先来看 op 0: @29。该节点 NAME 为 var_decl,可以看出它是描述一个变量的节点,其分析方法可参考上文 @5 节点的分析过程,这里不再赘述,直接给出分析结果。@29 节点描述的是一个名为 result,类型 int 型,初始值为 0 的一个整型变量。即 ast.c 源码中对 result 变量的定义, int result = 0;
@30 节点

@30     plus_expr        type: @10      op 0: @41      op 1: @5 
# 与该节点对应的存储结构为:
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;

从该节点名 plus_expr 可以看出该节点描述的是一段简单的加法运算表达式,type 字段同样描述的是该表达式的取值类型为 int,其后跟的两个操作数为两个加数,即 op 0 和 op 1 需要相加(op_0 + op_1)。首先来看 op 0: @41
@41 节点

@41     trunc_div_expr   type: @10      op 0: @43      op 1: @5 
# 与该节点对应的存储结构为:
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;

该节点名为 trunc_div_expr,是描述一段整数的除法运算(即只保留商的整数结果的除法运算),其中 op 0 为被除数,op 1 为除数 (op_0 / op_1)。其图示结构如下。
在这里插入图片描述
@43 节点

@43     var_decl         name: @46      type: @10      scpe: @11     
                         srcp: ast.c:4                 init: @47     
                         size: @13      algn: 32       used: 1  
# 与该节点对应的存储结构为:
# struct tree_var_decl GTY ((tag ("TS_VAR_DECL"))) var_decl;                          

@41 节点的 op 0 指向的节点 @43 节点名 NAME 为 var_decl,同样是描述一个变量定义,同 @5 节点的分析过程一样此处直接给出分析结果,该节点事实上是描述的源码 ast.c 中的 b 变量的定义以及初始化,int b = 20;
@41 节点的 op 1 指向的节点 @5,我们已经分析过了,是 int a = 10;,因此 @41 节点描述的表达式即为 (int) b / (int) a;,@30 节点的 op 0 的值为 (int)20 / (int)10 = (int)2
@30 节点的 op 1 指向 @5 节点,因此 @30 节点描述的表达式则为 (op_0 + op_1) == ((int)2 + (int)10),因此 @30 节点的值为 (int) 12
到这里我们就清楚了 @17 节点表达式描述的是,(int) result = (int) 20 / (int) 10 + (int) 10 = 12;,此时 result 变量应被赋值为 12
@18 节点 && @31 节点
而这时候 @6 节点描述的语句列表也被我们分析的只剩 @18 节点。该节点较为简单,下面将其有关的几个节点均列出来。

@18     return_expr      type: @4       expr: @31 
# 与该节点对应的存储结构为:
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;
@31     modify_expr      type: @10      op 0: @19      op 1: @20
# 与该节点对应的存储结构为:
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;
@19     result_decl      type: @10      scpe: @11      srcp: ast.c:1      
                         note: artificial              size: @13     
                         algn: 32
# 与该节点对应的存储结构为:                         
# struct tree_result_decl GTY ((tag ("TS_RESULT_DECL"))) result_decl;
@20     integer_cst      type: @10     int: 0
# 与该节点对应的存储结构为: 
# struct tree_int_cst GTY ((tag ("TS_INT_CST"))) int_cst; 

经过上面对一系列的节点分析,相信才华横溢的大家已经可以很轻松的读懂该节点的内容,首先该节点描述的是取值类型为 @4 节点即 void 类型,也就是该表达式无返回值,表达式主体指向的是 @31 节点,一段赋值语句,将一个整型常量 0 赋值给定义的函数返回值,即表示源码 ast.c 中的最后一条语句 return 0;,其图示结构如下。
return 0;
@3 节点
现在来观察第三个节点 return_expr

@3      return_expr      type: @4       expr: @7 
# 与该节点对应的存储结构为:               
# struct tree_exp GTY ((tag ("TS_EXP"))) exp;

可以看的出来该节点与我们刚刚分析过的 @18 节点所描述的表达式是完全相同的,只是对应节点不同而已,其含义同样是 return 0;,其描述的事实上是 int main() 函数的默认返回值是 0,有好奇心的小伙伴可以将 ast.c 中最后一条 return 0; 改为 return 666; (数字随意),再生成抽象语法树 AST 查看就会更加清楚了。下面笔者附上修改后生成 AST 文件,作为对比学习。

// 修改后的 ast.c
imaginemiracle@ubuntu:test$ cat ast.c
int main(int argc, char **argv)    /* argc and argv are never read in the body */
{
    
    
    int a = 10;
    int b = 20;

    int result = 0;

    result = a + b / a;    /* the division binds tighter: 10 + 20 / 10 = 12 */

    return 666;    /* non-zero on purpose, so this explicit return stands out in the dump */
}

这里只附上关键几个节点信息,足够用来明确两个 return_expr 节点分别描述的内容。

// 修改后的 ast.c.004t.original
@1      statement_list   0   : @2       1   : @3      
@2      bind_expr        type: @4       vars: @5       body: @6      
@3      return_expr      type: @4       expr: @7      

@6      statement_list   0   : @14      1   : @15      2   : @16     
                         3   : @17      4   : @18  
@7      modify_expr      type: @10      op 0: @19      op 1: @20 

@18     return_expr      type: @4       expr: @31 
@19     result_decl      type: @10      scpe: @11      srcp: test.c:1      
                         note: artificial              size: @13     
                         algn: 32      
@20     integer_cst      type: @10     int: 0

@31     modify_expr      type: @10      op 0: @19      op 1: @42

@42     integer_cst      type: @10     int: 666

到此整个 ast.c 源码的代码主体部分已经完全分析完成,只剩下一个 main 函数的声明节点 @11(其函数类型节点为 @26)未作分析,事实上对该节点分析并不难,感兴趣的小伙伴可以自行进行分析,参考源码和 gcc 目录下 ./gcc/tree-core.h 中定义的 tree_node,以及 ./gcc/tree.def 文件来分析。综合以上分析的内容以及 main 函数的定义节点即构成下面这幅完整的 ast.c 源码的抽象语法树(AST)图示结果。
在这里插入图片描述

1.4. 中间代码生成 (GIMPLE)

在语法&语义分析后的下一个步骤就是生成中间代码,语法分析阶段输出的表达式或程序语句是以语法树 (AST/GENERIC) 的形式存储,编译器并无法直接将其编译生成汇编程序,此时需要将其转换为一种中间代码。中间代码 (GIMPLE) 是编译过程中的一种临时代码,中间代码是 gcc 为了处理不同的前端语言而引入的一种与平台无关的代码。
gcc 中将完成对抽象语法树 (AST/GENERIC) 转换为中间代码 (GIMPLE) 这一过程命名为 pass,gcc 就是利用 pass 这一个抽象的名词代表了这一转换过程中的各种具体方法。在 gcc-12.1.0 中将 pass 的核心结构定义在了 gcc-12.1.0/gcc/tree-pass.h 中的 class opt_pass 类中。

/* An instance of a pass.  This is also "pass_data" to minimize the
   changes in existing code.

   Passes form a linked structure through the public sub and next
   pointers declared below.  */
class opt_pass : public pass_data
{
    
    
public:
  /* Virtual destructor with an empty body.  */
  virtual ~opt_pass () {
    
     }

  /* Create a copy of this pass.

     Passes that can have multiple instances must provide their own
     implementation of this, to ensure that any sharing of state between
     this instance and the copy is "wired up" correctly.

     The default implementation prints an error message and aborts.  */
  virtual opt_pass *clone ();
  /* Virtual hook taking an unsigned index and a bool; its default
     behaviour is not visible in this excerpt.  */
  virtual void set_pass_param (unsigned int, bool);

  /* This pass and all sub-passes are executed only if the function returns
     true.  The default implementation returns true.  */
  virtual bool gate (function *fun);

  /* This is the code to run.  If this is not overridden, then there should
     be sub-passes otherwise this pass does nothing.
     The return value contains TODOs to execute in addition to those in
     TODO_flags_finish.   */
  virtual unsigned int execute (function *fun);

protected:
  /* Takes a const pass_data reference and a gcc::context pointer.  Being
     protected, it is reachable only from derived classes and friends.  */
  opt_pass (const pass_data&, gcc::context *);

public:
  /* A list of sub-passes to run, dependent on gate predicate.  */
  opt_pass *sub;

  /* Next in the list of passes to run, independent of gate predicate.  */
  opt_pass *next;

  /* Static pass number, used as a fragment of the dump file name.  */
  int static_pass_number;

protected:
  /* Context pointer (presumably the one handed to the constructor --
     TODO confirm in passes.cc).  */
  gcc::context *m_ctxt;
};

为了说明问题,我们为 ast.c 代码增加一点内容,下面我们看一下 ast.c 生成的语法树 (AST/GENERIC),即上文看到 ast.c.004t.original 文件([注]:此处的该文件是语法树的 GENERIC 表达形式),和所生成的中间代码 (GIMPLE),此处的中间代码形式为三地址码形式。
添加内容后的 ast.c 文件:

imaginemiracle:ast$ cat ast.c 
int main(int argc, char **argv)
{
    
    
    int a = 10;
    int b = 20;

    int result = 0;

    result = a + b / a;

    if (result > (a + b)) {
    
    
        result -= 1;
    } else {
    
    
        result += 1;
    }

    return 0;
}

1.4.1. 语法树的GENERIC形式

ast.c 生成的语法树 (AST/GENERIC),GENERIC 表达形式:

imaginemiracle:ast$ gcc -fdump-tree-original-all ast.c 
imaginemiracle:ast$ ls
ast.c.004t.original a.out  ast.c
imaginemiracle:ast$ cat ast.c.004t.original 

;; Function main (null)
;; enabled by -tree-original


{
    
    
  int a = 10;
  int b = 20;
  int result = 0;

    int a = 10;
    int b = 20;
    int result = 0;
  result = b / a + a;
  if (a + b < result)
    {
    
    
      result = result + -1;
    }
  else
    {
    
    
      result = result + 1;
    }
  return 0;
}
return 0;

1.4.2. 中间代码 (GIMPLE)——三地址码形式

ast.c 生成的中间代码——三地址码形式:

imaginemiracle:ast$ gcc -fdump-tree-gimple ast.c 
imaginemiracle:ast$ ls
ast.c.004t.original  ast.c.005t.gimple  a.out  ast.c
imaginemiracle:ast$ cat ast.c.005t.gimple 
int main (int argc, char * * argv)
{
    
    
 int D.1953;

 {
    
    
   int a;
   int b;
   int result;

   a = 10;
   b = 20;
   result = 0;
   _1 = b / a;
   result = a + _1;
   _2 = a + b;
   if (result > _2) goto <D.1950>; else goto <D.1951>;
   <D.1950>:
   result = result + -1;
   goto <D.1952>;
   <D.1951>:
   result = result + 1;
   <D.1952>:
   D.1953 = 0;
   return D.1953;
 }
 D.1953 = 0;
 return D.1953;
}

1.4.3. AST/GENERIC 与 GIMPLE 的区别

中间代码 (GIMPLE) 和语法树 (AST/GENERIC) 的区别(也可以说是优点):

  • 与语言无关的中间表示
      语法树 (AST) 形式与前端的编程语言是高度相关的,即每种语言通过对应的词法&语法分析后生成的 AST 是异构的(即不同语言生成的语法树结构是不同的,如 Java、C++ 两种语言即便描述的是同一个表达式、语句或者是同功能的代码,但使用其对应的编译器所生成的语法树也是不同的,即异构)。而中间代码 (GIMPLE) 表示形式则是与语言无关的,任何语言在转换成中间代码 (GIMPLE) 形式时,都需遵从 GIMPLE 规范,从而最终转换为同构的中间代码。
  • 具有线性序列特征
      语法树 (AST) 是树形表达结构,难以进行编译处理([注]: 树形结构只是其难以处理的原因之一,此外还有语法树本身的异构性等原因,因此并不能直接将语法树编译生成汇编代码),而中间代码 (GIMPLE) 本质上是一种线性的表达序列,编译器能够更加方便、高效地做后续的编译处理,即编译成目标代码。
    [注]:虽然中间代码是一种线性表达序列,但依然会使用树形结构来描述表达式中的操作符、操作数等元素。但其宏观上讲还是一种线性结构,可以说是利用了树节点的便利性。

1.4.4. GIMPLE 的特点

通过上文的对比可以看出,中间代码 (GIMPLE) 是一种编译器更易于处理、具有同构性的临时代码,常见的中间代码有“三地址码”、“P-代码”等。

  • GIMPLE 引入临时变量保存中间结果,将 AST 表达式拆分成不超过三个操作数的元组(Tuples);
  • 由于 GIMPLE 形式在本质上是线性的代码序列,所有的计算表达式都表示成一系列的基本操作。AST 中的控制结构(如 if-else、for、while 等)在 GIMPLE 表示中都被转换成条件跳转语句(使用 goto 完成);
  • AST 中的词法作用域 (Lexical Scopes) 在低级 GIMPLE 中被取消。

1.5. 汇编代码生成

1.6. 目标代码生成

2. 链接阶段

3. 重定位阶段

猜你喜欢

转载自blog.csdn.net/qq_36393978/article/details/124604885