(1.2) GloVe Source Code Walkthrough: vocab_count.c

1. Overview

The job of vocab_count.c is to build the vocabulary. Its input is the entire corpus, and its output is the vocabulary: each word paired with the number of times it occurs in the corpus (as in the sample below).

//vocab.txt
the 1061396
of 593677
and 14567

The output key-value pairs are sorted by count from highest to lowest. Since C has no ready-made dict data structure, the code has to implement one by hand.

2. Source Code Analysis

First, let's look at how GloVe stores words:

typedef struct vocabulary {  // one vocabulary entry: a word and its count
    char *word;
    long long count;
} VOCAB;

typedef struct hashrec {    // hash-table node holding the same fields, chained on collision
    char *word;
    long long count;
    struct hashrec *next;
} HASHREC;

Each entry stores the word's string and its count. As in word2vec, this code needs to resolve hash collisions; GloVe's strategy is separate chaining, stringing words that share a hash value together in a linked list, which is what the HASHREC type is for. Below are the comparison functions used to sort the vocabulary.

/* String comparison with the same sign convention as strcmp */
int scmp( char *s1, char *s2 ) {
    while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
    return *s1 - *s2;
}


/* Higher count first; for equal counts, the lexicographically smaller word comes first */
int CompareVocabTie(const void *a, const void *b) {
    long long c;
    if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
    else return (scmp(((VOCAB *) a)->word,((VOCAB *) b)->word));
}

/* Sort by count only (unstable: equal-count words end up in no particular order) */
int CompareVocab(const void *a, const void *b) {
    long long c;
    if ( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
    else return 0;
}
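
To make the difference between the two comparators concrete, here is a toy example of my own (assuming the definitions above plus <stdlib.h>):

VOCAB v[] = { {"of", 5}, {"the", 9}, {"and", 5} };
qsort(v, 3, sizeof(VOCAB), CompareVocabTie);
/* order is now: the 9, and 5, of 5; the 5-count tie is broken alphabetically.
 * With CompareVocab instead, "and" and "of" could come out in either order. */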

Computing a word's hash value:

/* Simple bitwise hash function */
unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
    char c;
    unsigned int h;
    h = seed;
    for ( ; (c = *word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
    return (unsigned int)((h & 0x7fffffff) % tsize);
}
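
The hashing code relies on a few macros defined at the top of vocab_count.c but outside this excerpt; in the upstream source they look roughly like this (treat the exact values as illustrative):

#define MAX_STRING_LENGTH 1000  // longest token kept (used by get_word below)
#define TSIZE 1048576           // number of buckets in the hash table
#define SEED  1159241           // seed passed to bitwisehash
#define HASHFN bitwisehash      // the hash function hashinsert calls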

Initializing the hash table

HASHREC **ht is what implements the dict: it lets us jump from a word's hash value straight to the word. It is an array of linked lists, and the array's size is TSIZE, the upper bound on hash values. To find a word, first compute its hash value, say 15, then search the linked list at ht[15].

/* Create hash table, initialise pointers to NULL */
HASHREC ** inithashtable() {
    int i;
    HASHREC **ht;
    ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
    for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;  // note that every bucket must start out NULL
    return ht;
}

The hash-table insertion routine:

/* Search hash table for given string, insert if not found */
void hashinsert(HASHREC **ht, char *w) {
    HASHREC     *htmp, *hprv;
    unsigned int hval = HASHFN(w, TSIZE, SEED);
    
    for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next); 
    // The loop above searches for the right position (htmp); it stops when htmp reaches the end of the chain or lands on a node holding the same word.
    // hprv tracks the node just before htmp, which makes splicing into the list easy.
    if (htmp == NULL) {
        htmp = (HASHREC *) malloc( sizeof(HASHREC) );
        htmp->word = (char *) malloc( strlen(w) + 1 );
        strcpy(htmp->word, w);
        htmp->count = 1;
        htmp->next = NULL;
        if ( hprv==NULL )
            ht[hval] = htmp;
        else
            hprv->next = htmp;
    }
    else {
        /* new records are not moved to front */
        htmp->count++;
        if (hprv != NULL) {  // move-to-front: a small speed trick; a word seen just now is likely to recur soon, so move it to the head of its chain
            /* move to front on access */
            hprv->next = htmp->next;
            htmp->next = ht[hval];
            ht[hval] = htmp;
        }
    }
    return;
}
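
hashinsert only ever inserts or increments. For completeness, a matching lookup (not part of vocab_count.c, just my sketch following the same chaining scheme) would look like this:

/* Hypothetical helper (not in vocab_count.c): return w's count, or 0 if unseen */
long long hashlookup(HASHREC **ht, char *w) {
    HASHREC *htmp;
    unsigned int hval = HASHFN(w, TSIZE, SEED);
    for (htmp = ht[hval]; htmp != NULL; htmp = htmp->next)
        if (scmp(htmp->word, w) == 0) return htmp->count;
    return 0;
}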

The following function reads words from a file, one at a time:

int get_word(char *word, FILE *fin) {
    int i = 0, ch;
    for ( ; ; ) {
        ch = fgetc(fin);
        if (ch == '\r') continue;
        if (i == 0 && ((ch == '\n') || (ch == EOF))) {
            word[i] = 0;
            return 1;
        }
        if (i == 0 && ((ch == ' ') || (ch == '\t'))) continue; // skip leading space
        if ((ch == EOF) || (ch == ' ') || (ch == '\t') || (ch == '\n')) {
            if (ch == '\n') ungetc(ch, fin); // push the '\n' back onto the input stream so the next call sees it as a line boundary
            break;
        }
        if (i < MAX_STRING_LENGTH - 1)
          word[i++] = ch; // don't allow words to exceed MAX_STRING_LENGTH
    }
    word[i] = 0; //null terminate
    // avoid truncation destroying a multibyte UTF-8 char except if only thing on line (so the i > x tests won't overwrite word[0])
    // see https://en.wikipedia.org/wiki/UTF-8#Description
    if (i == MAX_STRING_LENGTH - 1 && (word[i-1] & 0x80) == 0x80) {
        if ((word[i-1] & 0xC0) == 0xC0) {
            word[i-1] = '\0';
        } else if (i > 2 && (word[i-2] & 0xE0) == 0xE0) {
            word[i-2] = '\0';
        } else if (i > 3 && (word[i-3] & 0xF8) == 0xF0) {
            word[i-3] = '\0';
        }
    }
    return 0;
}
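
To see get_word's contract in action (it returns 1 on a line or file boundary and 0 when it has produced a token), here is a tiny driver of my own, not part of the original file:

#include <stdio.h>

/* Hypothetical driver: echo each token from stdin, marking line boundaries */
int main(void) {
    char word[MAX_STRING_LENGTH + 1];
    while (!feof(stdin)) {
        if (get_word(word, stdin)) printf("--- line boundary or EOF ---\n");
        else printf("token: %s\n", word);
    }
    return 0;
}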

The main processing function sorts the key-value pairs accumulated in the hash table and writes them to the output. It also reads three globals (verbose, max_vocab, min_count) that live near the top of vocab_count.c but are not shown here; in the upstream source they look roughly like this:
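
int verbose = 2;          // verbosity level: 0, 1, or 2
long long min_count = 1;  // discard words occurring fewer than min_count times
long long max_vocab = 0;  // cap on the vocabulary size; 0 means no limit

Here is get_counts itself: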

int get_counts() {
    long long i = 0, j = 0, vocab_size = 12500;
    // char format[20];
    char str[MAX_STRING_LENGTH + 1];
    HASHREC **vocab_hash = inithashtable();
    HASHREC *htmp;
    VOCAB *vocab;
    FILE *fid = stdin;   // read from standard input; the program is run from the command line with the corpus redirected to stdin
    
    fprintf(stderr, "BUILDING VOCABULARY\n");
    if (verbose > 1) fprintf(stderr, "Processed %lld tokens.", i);
    while ( ! feof(fid)) {
        // Insert all tokens into hashtable
        int nl = get_word(str, fid);
        if (nl) continue; // just a newline marker or feof
        if (strcmp(str, "<unk>") == 0) {
            fprintf(stderr, "\nError, <unk> vector found in corpus.\nPlease remove <unk>s from your corpus (e.g. cat text8 | sed -e 's/<unk>/<raw_unk>/g' > text8.new)");
            return 1;
        }
        hashinsert(vocab_hash, str);
        if (((++i)%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[11G%lld tokens.", i);
    }
    if (verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i);
    vocab = malloc(sizeof(VOCAB) * vocab_size); // array that will hold the entries for sorting
    for (i = 0; i < TSIZE; i++) { // Migrate vocab to array
        htmp = vocab_hash[i];
        while (htmp != NULL) {
            vocab[j].word = htmp->word;
            vocab[j].count = htmp->count;
            j++;
            if (j>=vocab_size) {
                vocab_size += 2500;
                vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size);  // grow the array when it fills up
            }
            htmp = htmp->next;
        }
    }
    if (verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j);
    if (max_vocab > 0 && max_vocab < j) // if the vocabulary exceeds max_vocab, first sort by count alone to find the top max_vocab words; after truncation they are re-sorted with alphabetical tie-breaking
        qsort(vocab, j, sizeof(VOCAB), CompareVocab);
    else max_vocab = j;
    qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); //After (possibly) truncating, sort (possibly again), breaking ties alphabetically
    
    for (i = 0; i < max_vocab; i++) {
        if (vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary
            if (verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n",min_count);
            break;
        }
        printf("%s %lld\n",vocab[i].word,vocab[i].count);
    }
    
    if (i == max_vocab && max_vocab < j) if (verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab);
    fprintf(stderr, "Using vocabulary of size %lld.\n\n", i);
    return 0;
}
int find_arg(char *str, int argc, char **argv) {  // return the position of a named flag on the command line
    int i;
    for (i = 1; i < argc; i++) {
        if (!scmp(str, argv[i])) {
            if (i == argc - 1) {   // error out if the flag is the last token, i.e. no value follows it
                printf("No argument given for %s\n", str);
                exit(1);
            }
            return i;
        }
    }
    return -1;
}

int main(int argc, char **argv) {
    int i;
    if (argc == 1) {
        printf("Simple tool to extract unigram counts\n");
        printf("Author: Jeffrey Pennington ([email protected])\n\n");
        printf("Usage options:\n");
        printf("\t-verbose <int>\n");
        printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
        printf("\t-max-vocab <int>\n");
        printf("\t\tUpper bound on vocabulary size, i.e. keep the <int> most frequent words. The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\n");
        printf("\t-min-count <int>\n");
        printf("\t\tLower limit such that words which occur fewer than <int> times are discarded.\n");
        printf("\nExample usage:\n");
        printf("./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\n");
        return 0;
    }
    
    if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
    if ((i = find_arg((char *)"-max-vocab", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]);
    if ((i = find_arg((char *)"-min-count", argc, argv)) > 0) min_count = atoll(argv[i + 1]);
    return get_counts();
}

3. Usage

./vocab_count  -min-count  5  -verbose 2   <  test.txt  > vocab.txt
  • -min-count : the minimum number of times a word must occur to be kept
  • -verbose 2 : verbose output (prints progress information while the program runs, to show how far along it is)
  • -max-vocab : not given here; it defaults to 0, which places no limit on the vocabulary size
  • < : redirects the text file to the program's standard input
  • > : redirects the program's printed output to the vocabulary file

4. Takeaways

  1. Saw a new way for a C program to parse command-line arguments
  2. Gained a deeper understanding of hash-collision resolution
  3. The truncate-then-re-sort trick behind the max-vocab option is quite clever
  4. Got more practice with input/output redirection

5. Possible Improvements

Since the end product is just a sorted vocabulary file, the single-threaded run is slow on large corpora. One improvement would be to count in multiple threads, or to take a map-reduce view of the problem: count word frequencies per shard of the corpus, then combine the shards and produce the sorted output with a merge-sort style pass, as sketched below.
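
As a rough sketch of that merge step (my own illustration, not GloVe code): suppose each shard's counts have been collected into a VOCAB array sorted alphabetically with scmp. Merging two shards is then a linear pass that sums the counts of words present in both:

/* Hypothetical sketch: merge two alphabetically sorted per-shard vocab
 * arrays, summing counts of words that appear in both shards. A final
 * qsort with CompareVocabTie on the result gives the usual output order. */
long long merge_shards(VOCAB *a, long long na, VOCAB *b, long long nb, VOCAB *out) {
    long long i = 0, j = 0, k = 0;
    while (i < na && j < nb) {
        int c = scmp(a[i].word, b[j].word);
        if (c == 0) { out[k] = a[i++]; out[k].count += b[j++].count; k++; }
        else if (c < 0) out[k++] = a[i++];
        else out[k++] = b[j++];
    }
    while (i < na) out[k++] = a[i++];   /* drain whichever shard remains */
    while (j < nb) out[k++] = b[j++];
    return k;  /* number of distinct words after the merge */
}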

