(2) Lua源码系列----字符串的源码

Lua 的字符串 #

Lua 版本 5.3.4

1 字符串的数据结构

1.1 字符串分类

从 5.2.0 版本开始,Lua 区分长字符串和短字符串;区分“长短”的长度阈值定义在 llimits.h 中:
/* Length threshold for "short" strings: luaS_newlstr interns strings of at
** most this many chars and builds longer ones as separate long-string
** objects. */
#define LUAI_MAXSHORTLEN 40

“长短” 类型的定义在 lobject.h

/*
** Variant tags for strings: the variant number (0 = short, 1 = long) is
** shifted left by 4 bits and OR-ed onto the base LUA_TSTRING tag.
*/
#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4))  /* short strings */
#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4))  /* long strings */

1.2 字符串的结构

/*
** Header for string value; string bytes follow the end of this structure
** (aligned according to 'UTString'; see next).
** This is only the header of a string: the actual characters are stored
** right after this structure.
*/
typedef struct TString {
  CommonHeader;
  lu_byte extra;  /* reserved words for short strings; "has hash" for longs */
                  /* short strings: marks whether the string is a reserved
                     word; long strings: whether the hash has already been
                     computed (see note ① below) */
  lu_byte shrlen;  /* length for short strings */
  unsigned int hash;
  union {
    size_t lnglen;  /* length for long strings */
    struct TString *hnext;  /* short strings: linked list for hash table */
  } u;
} TString;


/*
** Ensures that address after this type is always fully aligned.
*/
typedef union UTString {
  L_Umaxalign dummy;  /* present for memory alignment */
  TString tsv;
} UTString;

补充1: 字符串申请内存大小
/* Bytes to allocate for a string of 'l' chars: the aligned header
** (union UTString), plus the 'l' chars, plus one more char for the
** ending '\0' that createstrobj writes at index 'l'. */
#define sizelstring(l) (sizeof(union UTString) + ((l) + 1) * sizeof(char))

说明①: 长字符串的 hash 值是惰性计算的(第一次需要时才计算,并用 extra 字段标记是否已经计算过)

/*
** Returns the hash of long string 'ts', computing it on first use.
** 'extra' serves as the "hash already computed" flag. When the hash is
** computed, the value previously stored in 'ts->hash' is passed to
** luaS_hash as the seed and is then overwritten by the result.
*/
unsigned int luaS_hashlongstr (TString *ts) {
  lua_assert(ts->tt == LUA_TLNGSTR);
  if (ts->extra != 0)
    return ts->hash;  /* hash was computed by an earlier call */
  ts->hash = luaS_hash(getstr(ts), ts->u.lnglen, ts->hash);
  ts->extra = 1;  /* remember that the hash is now valid */
  return ts->hash;
}

2.Lua hash 的计算

在 Lua 5.2.0 之前,Lua 的哈希计算十分简单(在 lstring.c 的 luaS_newlstr 中实现),很容易根据其计算过程构造出大量 hash 值相同、但内容各不相同的字符串,从而造成 hash DoS(字符串哈希表中出现很长的链表,查找、修改数据的性能下降)。
接下来说说 5.3.4 版本的 hash 计算函数。为了解决上述安全问题,后续版本在 hash 计算中加入了随机种子。

2.1 字符串随机种子的实现

全局表中的随机种子: g->seed = makeseed(L);

/* Time-based starting value for the seed: the result of time(NULL),
** converted to unsigned int. */
#define luai_makeseed()     cast(unsigned int, time(NULL))

/*
** Compute an initial seed as random as possible. Rely on Address Space
** Layout Randomization (if present) to increase randomness..
** The seed mixes the current time with several addresses, so it draws on
** the randomness of both.
**
** addbuff(b, p, e): converts 'e' to a size_t, copies its bytes into
** buffer 'b' at offset 'p', and advances 'p' by sizeof(size_t). 'p' must
** be a modifiable lvalue. Every line of the body except the last must end
** with a backslash; otherwise the macro is cut short and the remaining
** statements land at file scope.
*/
#define addbuff(b,p,e) \
  { size_t t = cast(size_t, e); \
    memcpy(b + p, &t, sizeof(t)); \
    p += sizeof(t); }

/*
** Builds the seed for string hashing. Four addresses (one per kind of
** storage) are serialized into a byte buffer, which is then hashed by
** luaS_hash with the time-based value from luai_makeseed() as the seed.
*/
static unsigned int makeseed (lua_State *L) {
  char bytes[4 * sizeof(size_t)];
  unsigned int tseed = luai_makeseed();
  int used = 0;  /* bytes of 'bytes' filled so far */
  addbuff(bytes, used, L);  /* address of a heap object */
  addbuff(bytes, used, &tseed);  /* address of a local variable */
  addbuff(bytes, used, luaO_nilobject);  /* address of a global variable */
  addbuff(bytes, used, &lua_newstate);  /* address of a public function */
  lua_assert(used == sizeof(bytes));  /* the buffer must be exactly full */
  return luaS_hash(bytes, used, tseed);
}

2.2 字符串hash值的计算

/*
** Hashes the 'l' bytes at 'str', starting from 'seed' xor-ed with the
** length. The string is sampled from its end towards its beginning in
** strides of 'step', so long strings do not have every byte visited.
*/
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  unsigned int h = seed ^ cast(unsigned int, l);
  /* stride between sampled bytes (the article notes that LUAI_HASHLIMIT
     defaults to 5) */
  const size_t step = (l >> LUAI_HASHLIMIT) + 1;
  while (l >= step) {
    h ^= ((h << 5) + (h >> 2) + cast_byte(str[l - 1]));
    l -= step;
  }
  return h;
}

3.字符串的创建

/*
** new string (with explicit length)
** Creation takes one of two paths depending on the length: strings of at
** most LUAI_MAXSHORTLEN chars go through internshrstr; longer ones get a
** fresh long-string object into which the bytes are copied.
*/
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  TString *ts;
  if (l <= LUAI_MAXSHORTLEN)  /* short string: intern it */
    return internshrstr(L, str, l);
  /* long string: reject sizes that cannot be represented */
  if (l >= (MAX_SIZE - sizeof(TString))/sizeof(char))
    luaM_toobig(L);  /* NOTE(review): presumably does not return -- confirm */
  ts = luaS_createlngstrobj(L, l);
  memcpy(getstr(ts), str, l * sizeof(char));
  return ts;
}

下面是创建新的长短字符串对象的函数。

/*
** creates a new string object
** Allocates room for the header plus 'l' chars and the ending '\0',
** stores 'h' in the hash field, clears 'extra' and writes the terminator.
** The characters themselves are left for the caller to fill in.
*/
static TString *createstrobj (lua_State *L, size_t l, int tag, unsigned int h) {
  /* 'o' stays a named variable: gco2ts is a macro and may evaluate its
     argument more than once */
  GCObject *o = luaC_newobj(L, tag, sizelstring(l));
  TString *ts = gco2ts(o);
  ts->hash = h;
  ts->extra = 0;
  getstr(ts)[l] = '\0';  /* ending 0 */
  return ts;
}

补充1: 短字符串的内部化
短字符串放在全局的字符串表中
global_State->strt;

其中strt结构如下:
/* Hash table that holds the short strings. */
typedef struct stringtable {
  TString **hash;  /* array of buckets; each bucket is the head of a chain linked through 'u.hnext' */
  int nuse;  /* number of elements */
  int size;  /* number of buckets in 'hash' */
} stringtable;

---------------------  internshrstr  ----------------------------
/*
** Interns a short string: first checks whether an equal string already
** exists in the global string table and, if so, reuses it; otherwise
** creates a new string object and links it at the head of its bucket.
*/
static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  TString *ts;
  global_State *g = G(L);
  unsigned int h = luaS_hash(str, l, g->seed);
  TString **bucket = &g->strt.hash[lmod(h, g->strt.size)];
  lua_assert(str != NULL);  /* otherwise 'memcmp'/'memcpy' are undefined */
  ts = *bucket;
  while (ts != NULL) {  /* walk the chain looking for an equal string */
    int same = (l == ts->shrlen) &&
               (memcmp(str, getstr(ts), l * sizeof(char)) == 0);
    if (same) {
      if (isdead(g, ts))  /* dead (but not collected yet)? */
        changewhite(ts);  /* resurrect it */
      return ts;
    }
    ts = ts->u.hnext;
  }
  /* not found: grow the table first when it is full and can still double */
  if (g->strt.nuse >= g->strt.size && g->strt.size <= MAX_INT/2) {
    luaS_resize(L, g->strt.size * 2);
    /* the bucket array changed, so the bucket must be looked up again */
    bucket = &g->strt.hash[lmod(h, g->strt.size)];
  }
  ts = createstrobj(L, l, LUA_TSHRSTR, h);
  memcpy(getstr(ts), str, l * sizeof(char));
  ts->shrlen = cast_byte(l);
  ts->u.hnext = *bucket;  /* insert at the head of the chain */
  *bucket = ts;
  g->strt.nuse++;
  return ts;
}

4. 字符串的比较

在已知上面的基础上,思考:如果自己实现Lua字符串的比较,会怎么去实现?
短字符串如何比较相等,长字符串呢?

4.1 短字符串

相同的短字符串在Lua中只会存在一份,那么我们直接根据二者的物理地址比较就可以,非常高效。
/* Equality of short strings is a plain pointer comparison. The first
** argument to check_exp (defined elsewhere) is the sanity condition that
** 'a' is tagged as a short string. This relies on equal short strings
** being the same object, which internshrstr arranges by reusing an
** existing entry when one is found. */
#define eqshrstr(a,b) check_exp((a)->tt == LUA_TSHRSTR, (a) == (b))

4.2 长字符串

/*
** Equality for long strings. Returns 1 when 'a' and 'b' are the same
** object, or when they have the same length and the same bytes;
** returns 0 otherwise.
*/
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_TLNGSTR && b->tt == LUA_TLNGSTR);
  if (a == b)  /* same instance */
    return 1;
  if (len != b->u.lnglen)  /* different lengths can never be equal */
    return 0;
  return memcmp(getstr(a), getstr(b), len) == 0;  /* compare contents */
}

5. 字符串表的调整 ##

/*
** Resizes the string table to 'newsize' buckets and redistributes every
** string according to the new size. 'newsize' is always a power of 2.
** When growing, the array is enlarged before rehashing; when shrinking,
** it is reallocated only after rehashing has emptied the slice that goes
** away.
*/
void luaS_resize (lua_State *L, int newsize) {
  int i;
  stringtable *tb = &G(L)->strt;
  if (newsize > tb->size) {  /* growing: enlarge and clear the new buckets */
    luaM_reallocvector(L, tb->hash, tb->size, newsize, TString *);
    for (i = tb->size; i < newsize; i++)
      tb->hash[i] = NULL;
  }
  for (i = 0; i < tb->size; i++) {  /* rehash every old bucket */
    TString *node = tb->hash[i];
    tb->hash[i] = NULL;  /* detach the chain before redistributing it */
    while (node != NULL) {
      TString *rest = node->u.hnext;  /* remember the rest of the chain */
      TString **slot = &tb->hash[lmod(node->hash, newsize)];  /* new bucket */
      node->u.hnext = *slot;  /* push the node at the head of that bucket */
      *slot = node;
      node = rest;
    }
  }
  if (newsize < tb->size) {  /* shrinking: drop the now-unused tail */
    /* vanishing slice should be empty */
    lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
    luaM_reallocvector(L, tb->hash, tb->size, newsize, TString *);
  }
  tb->size = newsize;
}

Lua 源码中调用这个函数的地方有三处:1. GC 的时候;2. 增加短字符串时;3. 初始化 Lua 字符串环境时(luaS_init)。

参考资料

<1> Lua 源码 5.3.4

<2> 云峰 https://www.codingnow.com/temp/readinglua.pdf

猜你喜欢

转载自blog.csdn.net/l101606022/article/details/79007550