Word-frequency counting with unordered_map

Preface:

        The first software engineering assignment is to compute word-frequency statistics over the files in a folder; the detailed requirements are posted on the course blog.

        Assignment: Software Engineering, Assignment 1

Requirements:

       

1. Count the number of characters in the files

2. Count the total number of words in the files

3. Count the total number of lines in the files

4. Count how many times each word appears in the files

5. Run the statistics over all files in a given folder and, recursively, its subfolders

6. Count how often two words appear together (phrases) and output the 10 most frequent ones

PSP:

PSP table

PSP Stage                                 Description                                         Time (h)
Planning                                  Planning                                            20
  Estimate                                Estimate how much time the task will take           20
Development                               Development                                         52
  Analysis                                Requirements analysis (incl. learning new tech)     2
  Design Spec                             Produce the design document                         1
  Design                                  Detailed design                                     4
  Coding                                  Coding                                              13
  Code Review                             Code review                                         2
  Test                                    Testing (self-test, fix code, commit fixes)         30
Reporting                                 Reporting                                           10
  Test Report                             Test report                                         6
  Size Measurement                        Measure the size of the work                        2
  Postmortem & Process Improvement Plan   Postmortem and process improvement plan             2

Approach:

        At first I planned to write a trie, whose lookup cost is O(n) in the length of the word. Then it occurred to me: why not use a hash table instead, where a lookup is amortized O(1)? So I set out to write a hash table in one go. While Googling how to write a perfect hash, an expert mentioned unordered_map. (Thanks here to the expert Zhao R, who prefers to stay anonymous.)
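        To make that trade-off concrete, here is a small illustrative sketch of my own (not code from this project): a hand-written trie needs an explicit node type and walks one pointer per character, while the hash-table version is a single container plus one line per update.

#include <iostream>
#include <string>
#include <unordered_map>

// What a hand-written trie would require: one node per prefix,
// and a walk of length L (the word length) for every lookup.
struct TrieNode {
	TrieNode* next[26] = {};
	int count = 0;
};

// The hash-table alternative: the whole dictionary is one container,
// and each update is an amortized O(1) operation on the key.
std::unordered_map<std::string, int> freq;

int main() {
	for (std::string w : { "that", "the", "that" })
		++freq[w];                      // insert-or-increment in one step
	std::cout << freq["that"] << '\n';  // prints 2
}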

        I then Googled unordered_map; the most thorough blog post I found is this one, and the author explains it in great detail.

        I also brushed up on how to use map; details here.
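        As a quick refresher (a minimal example of my own, not taken from the linked posts), the handful of unordered_map operations this project relies on look like this:

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
	std::unordered_map<std::string, int> counts;

	counts["hello"] = 1;              // operator[] default-constructs the value for a new key
	++counts["hello"];                // ...so it doubles as insert-or-update
	counts.insert({ "world", 5 });    // insert() leaves an existing key untouched

	auto it = counts.find("hello");   // find() looks up without inserting
	if (it != counts.end())
		std::cout << it->first << " -> " << it->second << '\n';

	for (const auto& kv : counts)     // iteration order is unspecified for a hash table
		std::cout << kv.first << ": " << kv.second << '\n';
}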

        OK, time to get to work~

class word_time {
public:
	string word; // the spelling used for the final output (the lexicographically smallest form seen)
	int time;    // number of times the word appears
public:
	word_time(){   // constructor: initialize the fields
		this->word = "";
		this->time = 0;
	}
};
unordered_map<string, word_time> word_list;   // a dictionary from a word (its normalized form) to a class holding its simplest spelling and its count
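Standalone, the update rule this map is meant to support looks roughly like the sketch below (record() is a hypothetical helper, not a function from the project): the key is the upper-cased word, and the value keeps the running count plus the lexicographically smallest original spelling for the final output.

#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>

struct word_time { std::string word; int time = 0; };

std::unordered_map<std::string, word_time> word_list;

void record(const std::string& original) {
	std::string key = original;
	std::transform(key.begin(), key.end(), key.begin(), ::toupper);
	word_time& entry = word_list[key];       // the first sighting creates {"", 0}
	++entry.time;
	if (entry.word.empty() || original < entry.word)
		entry.word = original;               // remember the smallest spelling seen so far
}

int main() {
	record("That"); record("THAT"); record("that");
	std::cout << word_list["THAT"].word << " x " << word_list["THAT"].time << '\n';  // THAT x 3
}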

That is the main idea. Without further ado, here is the code~

Code:

        

#include <io.h>
#include <iostream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <cstring>       // strcmp
#include <vector>        // std::vector
#include <algorithm>
#include <fstream>
using namespace std;


class word_time {
public:
	string word;
	int time;
public:
	word_time(){
		this->word = "";
		this->time = 0;
	}
};

class word_word_time : public word_time {
public:
	string word_s;
	word_word_time() {
		this->time = 0;
		this->word = "";
		this->word_s = "";
	}
	void operator=(const word_word_time &another) {
		this->time = another.time;
		this->word = another.word;
		this->word_s = another.word_s;
	}
};

class bi_word {
public:
	string str1;
	string str2;
	bi_word() {
		this->str1 = "";
		this->str2 = "";
	}

	bi_word(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}

	bool operator==(const bi_word &another) {
		if (this->str1 == another.str1&&this->str2 == another.str2)
			return true;
		return false;
	}

	void operator=(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}
};

unordered_map<string, word_time> word_list;
unordered_map<string, word_word_time> bi_word_list;

/*
Check whether a char is a letter
Parameter type: char
*/
bool is_letter(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122)
		return true;
	return false;
}
/*
Check whether a char is a separator (anything other than a letter or digit)
*/
bool is_fengefu(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57)
		return false;
	return true;
}

/*
Add a word to word_list and update its count
Parameter type: string
*/
void add_a_word(string word) {
	if (!is_letter(word[0]))
		return;                                                            // if word[0] is not a letter, return
	string word_ = word;
	string::iterator it;
	word_time word__time;
	it = word.end();
	it--;
	while (!is_letter(*it)) {
		it--;
	};																		// back up until *it is a letter
	word.erase(it+1, word.end());											// keep only the leading part (strip trailing non-letters)
	/*for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return;
	}*/																		// (disabled) reject the word if its first four chars are not all letters
	transform(word.begin(), word.end(), word.begin(), ::toupper);           // convert to upper case
	//word_time one = word_list[word];
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it

}

/*
Count the number of characters in a line
Parameter type: string
*/
int count_char_sum(string str) {
	return(str.length());
}

/*
Forward declaration of add_a_bi_word
*/
void add_a_bi_word(bi_word b_word);


/*
Feed the words of one line into word_list, build the n-1 adjacent word pairs, and feed those pairs into bi_word_list (which is also a map)
*/
int sum=0;
string str_temp,str_now;
void insert_into_wordlist(string &line) {
	vector<vector<string>> wordlist_of_a_line_vec;
	vector<bi_word> bi_wordlist_of_a_line;
	bi_word temp;
	//string::iterator it=line.begin(),it1=line.begin();
	int it_last=0,it1;
	bool flag=false;

	line.append(" ");
	for (; is_fengefu(line[it_last])&&(size_t)it_last<line.length(); it_last++);
	for (int it=it_last; line[it]!= '\0'&& (size_t)it<line.length(); it++) {
		if (is_fengefu(line[it])) {

			for (it1 = it_last; it1 - it_last < 4 &&(size_t)it1<line.length(); it1++) {
				if (!is_letter(line[it1])) {
					flag = true;
					break;
				}
			}// check whether the token is a word; if not, drop it
			if (flag == false) {// it is a word
				/*if(wordlist_of_a_line_vec[0].size<40)
					wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));  // append to the per-line word list
				else {
					wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last));
				}*/
				sum++;
				str_now = line.substr(it_last, it - it_last);
				add_a_word(str_now);
				if (str_temp != "") {
					temp.str1 = str_temp;
					temp.str2 = str_now;
					add_a_bi_word(temp);
				}
				str_temp = str_now;
			}
				/*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/
					//wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));
				
			flag = false;
			it_last = it + 1;
		}

	}

	// map the words of the line into the hash maps one by one
	/*int ii = 0;
	for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) {

		add_a_word(*it1);// add each word to the dictionary

		if (it1 < wordlist_of_a_line_vec[0].end() - 1) {
			bi_wordlist_of_a_line.push_back(temp);
			bi_wordlist_of_a_line[ii].str1 = *it1;
			bi_wordlist_of_a_line[ii].str2 = *(it1+1);
			ii++;
		}             // append to the word-pair list
	}                                       // add each word to the dictionary
	for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) {
		add_a_bi_word(*it2);
	}*/
}


/*
Collect the 10 most frequent words in the files
Returns a vector<word_time>
*/
vector<word_time> the_most_ten() {
	vector<word_time> most_ten(10);
	unordered_map<string, word_time>::iterator it = word_list.begin();
	while (it != word_list.end()) {
		if (it->second.time > most_ten[9].time) {
			if (it->second.time > most_ten[0].time)
				most_ten.insert(most_ten.begin(), it->second);
			else
			for (int ii = 1; ii<=9; ii++) {
				if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) {
					most_ten.insert(most_ten.begin() + ii, it->second);
					break;
				}
			}
			//if(it->second.time > most_ten[0].time)
				//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten.erase(most_ten.begin() + 10, most_ten.end());
	return most_ten;
}

/*
Count the word pairs (phrases) in the files and store them in bi_word_list
*/

void add_a_bi_word(bi_word b_word) {
	if (!is_letter(b_word.str1[0])|| !is_letter(b_word.str2[0]))
		return;                                                            // if the first char of either word is not a letter, return
	bi_word b_word_ = b_word;
	string::iterator it1,it2;
	word_word_time word_word__time;
	it1 = b_word.str1.end();
	it2 = b_word.str2.end();
	it1--; it2--;
	while (!is_letter(*it1)) {
		it1--;
	};																		// back up until *it1 is a letter
	while (!is_letter(*it2)) {
		it2--;
	};
	b_word.str1.erase(it1 + 1, b_word.str1.end());											// strip the trailing non-letter part
	b_word.str2.erase(it2 + 1, b_word.str2.end());
	/*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) {
		if (!is_letter(*it1)) return;
	}																		// (disabled) reject if the first four chars are not all letters
	for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) {
		if (!is_letter(*it2)) return;
	}*/
	transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper);           // convert to upper case
	transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper);
	string temp = b_word.str1 + b_word.str2;
	bi_word_list[temp].time++;													// put the normalized pair into bi_word_list and bump its count
	if (bi_word_list[temp].word == "" || (bi_word_list[temp].word+ bi_word_list[temp].word_s).compare(b_word_.str1+b_word_.str2)>0) {
		bi_word_list[temp].word = b_word_.str1;
		bi_word_list[temp].word_s = b_word_.str2;
	}															// if the new spelling is smaller than the stored one, update it

}


//"      hello fucking333 world hello fuck fuck abc fucking231 \n hello sd"



/*
Collect the 10 most frequent word pairs in the files
Returns a vector<word_word_time>
*/
vector<word_word_time> the_most_ten_bi() {
	vector<word_word_time> most_ten_bi(10);
	word_word_time temp;
	unordered_map<string, word_word_time>::iterator it = bi_word_list.begin();
	while (it != bi_word_list.end()) {
		/*most_ten_bi[10] = it->second;
		for (int ii = 10; ii >= 1; ii--) {
			if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) {
				temp = most_ten_bi[ii];
				most_ten_bi[ii] = most_ten_bi[ii - 1];
				most_ten_bi[ii - 1] = temp;
			}
		}*/
		if (it->second.time > most_ten_bi[9].time) {
			if (it->second.time > most_ten_bi[0].time)
				most_ten_bi.insert(most_ten_bi.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) {
						most_ten_bi.insert(most_ten_bi.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end());
	return most_ten_bi;
}

/*
Depth-first traversal of a folder and its subdirectories
*/
long sum1 = 0;
int line_sum = 0;
void DfsFolder(string path, int layer)
{
	_finddata_t file_info;
	string current_path = path + "/*.*"; // "/*" would also match everything
	int handle = _findfirst(current_path.c_str(), &file_info);
	// _findfirst returns -1 if the search fails
	ifstream infile;
	string temp, text;
	if (-1 == handle)
	{
		cout << "cannot match the path" << endl;
		return;
	}

	do
	{
		// check whether the entry is a subdirectory
		if (file_info.attrib & _A_SUBDIR)   // attrib is a bit mask, so test the flag rather than compare for equality
		{
			// recurse into the subdirectory
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
				cout << "--";
			cout << file_info.name << endl;*/
			int layer_tmp = layer;
			if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0)  // "." is the current directory and ".." is the parent; both must be skipped
				DfsFolder(path + '/' + file_info.name, layer_tmp + 1); // on Windows "\\" also works as a separator, but it is not recommended
		}
		else
		{
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
				cout << "--";
			cout << file_info.name << endl;*/
			infile.open(path + '/' + file_info.name, ios::in);
			while (getline(infile, temp)) {
				//text.append(temp);
				//cout << temp << endl;
				sum1 += temp.length();
				line_sum++;
				insert_into_wordlist(temp);
			}
			//insert_into_wordlist(text);
			infile.close();
		}
	} while (!_findnext(handle, &file_info));  // _findnext returns 0 while there are more entries, non-zero when the traversal is done
											   // close the find handle
	_findclose(handle);
}

int main() {
	DfsFolder("E:/tales", 0);
	word_time test = word_list["THAT"];
	// top ten word pairs
	vector<word_word_time> a=the_most_ten_bi();
	// top ten words
	vector<word_time> b = the_most_ten();
	// total number of characters
	sum1;
	// total number of words
	sum;
	// total number of lines
	line_sum;

	//vector<word_word_time

}


/*
int main() {
	bi_word a, b, c, d, e, f;
	a.str1 = "hello"; a.str2 = "world";
	b.str1 = "hello1"; b.str2 = "world";
	c.str1 = "hello"; c.str2 = "world2";
	d.str1 = "hello"; d.str2 = "fuck33";
	e.str1 = "world"; e.str2 = "hello2";
	f.str1 = "fucking"; f.str2 = "world";
	add_a_bi_word(a);
	add_a_bi_word(b);
	add_a_bi_word(c);
	add_a_bi_word(d);
	add_a_bi_word(e);
	add_a_bi_word(f);
}
*/








/*
int main() {
	string h = "      hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd";
	//string h = "    abc";
	insert_into_wordlist(h);
	vector<word_time> ten_word=the_most_ten();
	vector<word_word_time>ten_bi_word = the_most_ten_bi();
	system("pause");
	return 0;
}
*/


/*
int main() {
	//define the struct; during the search it holds the attributes of the file that was found
	_finddata_t file;
	//search for all files; _findfirst returns -1 on failure and a handle on success
	int k;
	long HANDLE;
	k = HANDLE = _findfirst("*.*", &file);
	//using the handle, keep fetching the next file until no new file can be found
	while (k != -1) {
		//cout << file.name << endl;  per-file processing goes here
		k = _findnext(HANDLE, &file);
	}
	_findclose(HANDLE);
	return 0;
}*/
/*
int main() {
	string word = "StRt123546";
	if (!(word[0] >= 65 && word[0] <= 90 || word[0] >= 97 && word[0] <= 122))
		return 0;
	string word_ = word;
	string::iterator it=word.end();
	it--;
	while (!(*it >= 65 && *it <= 90 || *it >= 97 && *it <= 122)) {
		it--;
	};// back up until *it is a letter
	word.erase(it+1, word.end());
	for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return 0;
	}
	transform(word.begin(), word.end(), word.begin(), ::toupper);
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it
	cout << word << endl;
	system("pause");
	return 0;
}*/

The test samples are in the commented-out main() functions above; each one exercises a single unit.
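As a sketch, those drivers could also be written as assert-style checks, assuming add_a_word() and word_list from the listing above are compiled into the same program (test_add_a_word() is a hypothetical helper, not part of the project):

#include <cassert>

void test_add_a_word() {
	word_list.clear();
	add_a_word("Hello123");                       // trailing non-letters should be stripped
	add_a_word("HELLO");
	assert(word_list["HELLO"].time == 2);         // both calls land on the same key
	assert(word_list["HELLO"].word == "HELLO");   // the byte-wise smallest spelling is kept
}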

Full test set

1. An empty set (empty folder)

2. A single text file with no content

3. A single text file containing only one space

4. A single text file containing one space and one newline

5. A single text file (a minimal case)

6. Two files: a .txt (text file) and a .pdf (binary file)

7. Multiple files in multiple folders



Optimizations:

        1. Eliminate repeated computation: cut down the work done on each call inside the insert function; run time drops by about 5%.

        2. Improve the read path: use getline. fread is faster, but it behaved unreliably on some samples, so it was dropped. The code gets slightly faster.

        3. Improve parameter passing: pass the string by reference, which cuts the average run time by about 30% (see the sketch after this list).

        4. Drop the intermediate per-line word list and insert each word as soon as it is read, which cuts the average run time by about 10%.
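        As an illustration of item 3, here is a minimal before/after sketch of my own; the by-value signature is reconstructed for comparison, not code taken from an earlier revision of this project:

#include <string>

// Before (reconstructed): the caller's whole line is copied into the parameter on every call.
static size_t parse_line_by_value(std::string line) { return line.size(); }

// After: passing by reference lets the parser work on the caller's buffer directly,
// which removes the per-line copy and is where the ~30% saving comes from.
static size_t parse_line_by_ref(std::string& line) { return line.size(); }

int main() {
	std::string line = "hello world";
	return (int)(parse_line_by_value(line) - parse_line_by_ref(line));  // both see the same text
}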

The final code:

#include <io.h>
#include <iostream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <cstring>       // strcmp
#include <vector>        // std::vector
#include <algorithm>
#include <fstream>
#include <time.h>
//#include <iomanip>
using namespace std;


class word_time {
public:
	string word;
	int time;
public:
	word_time() {
		this->word = "";
		this->time = 0;
	}
};

class word_word_time : public word_time {
public:
	string word_s;
	word_word_time() {
		this->time = 0;
		this->word = "";
		this->word_s = "";
	}
	void operator=(const word_word_time &another) {
		this->time = another.time;
		this->word = another.word;
		this->word_s = another.word_s;
	}
};

class bi_word {
public:
	string str1;
	string str2;
	bi_word() {
		this->str1 = "";
		this->str2 = "";
	}

	bi_word(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}

	bool operator==(const bi_word &another) {
		if (this->str1 == another.str1&&this->str2 == another.str2)
			return true;
		return false;
	}

	void operator=(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}
};

unordered_map<string, word_time> word_list;
unordered_map<string, word_word_time> bi_word_list;

/*
Check whether a char is a letter
Parameter type: char
*/
bool is_letter(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122)
		return true;
	return false;
}
/*
Check whether a char is a separator (anything other than a letter or digit)
*/
bool is_fengefu(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57)
		return false;
	return true;
}

/*
Add a word to word_list and update its count
Parameter type: string
*/
void add_a_word(string word) {
	if (!is_letter(word[0]))
		return;                                                            // if word[0] is not a letter, return
	string word_ = word;
	string::iterator it;
	word_time word__time;
	it = word.end();
	it--;
	while (!is_letter(*it)) {
		it--;
	};																		// back up until *it is a letter
	word.erase(it + 1, word.end());											// keep only the leading part (strip trailing non-letters)
	/*for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return;
	}*/																		// (disabled) reject the word if its first four chars are not all letters
	transform(word.begin(), word.end(), word.begin(), ::toupper);           // convert to upper case
	//word_time one = word_list[word];
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it

}

/*
Count the number of characters in a line
Parameter type: string
*/
int count_char_sum(string str) {
	return(str.length());
}

/*
Forward declaration of add_a_bi_word
*/
void add_a_bi_word(bi_word b_word);


/*
Feed the words of one line into word_list, build the n-1 adjacent word pairs, and feed those pairs into bi_word_list (which is also a map)
*/
int sum = 0;
string str_temp, str_now;
void insert_into_wordlist(string &line) {
	vector<vector<string>> wordlist_of_a_line_vec;
	vector<bi_word> bi_wordlist_of_a_line;
	bi_word temp;
	//string::iterator it=line.begin(),it1=line.begin();
	int it_last = 0, it1;
	bool flag = false;

	line.append(" ");
	for (; is_fengefu(line[it_last]) && (size_t)it_last<line.length(); it_last++);
	for (int it = it_last; line[it] != '\0' && (size_t)it<line.length(); it++) {
		if (is_fengefu(line[it])) {

			for (it1 = it_last; it1 - it_last < 4 && (size_t)it1<line.length(); it1++) {
				if (!is_letter(line[it1])) {
					flag = true;
					break;
				}
			}// check whether the token is a word; if not, drop it
			if (flag == false) {// it is a word
				/*if(wordlist_of_a_line_vec[0].size<40)
				wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));  // append to the per-line word list
								else {
								wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last));
								}*/
				sum++;
				str_now = line.substr(it_last, it - it_last);
				add_a_word(str_now);
				if (str_temp != "") {
					temp.str1 = str_temp;
					temp.str2 = str_now;
					add_a_bi_word(temp);
				}
				str_temp = str_now;
			}
			/*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/
			//wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));

			flag = false;
			it_last = it + 1;
		}

	}

	// map the words of the line into the hash maps one by one
	/*int ii = 0;
	for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) {

	add_a_word(*it1);// add each word to the dictionary

	if (it1 < wordlist_of_a_line_vec[0].end() - 1) {
	bi_wordlist_of_a_line.push_back(temp);
	bi_wordlist_of_a_line[ii].str1 = *it1;
	bi_wordlist_of_a_line[ii].str2 = *(it1+1);
	ii++;
	}             // append to the word-pair list
	}                                       // add each word to the dictionary
	for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) {
	add_a_bi_word(*it2);
	}*/
}


/*
Collect the 10 most frequent words in the files
Returns a vector<word_time>
*/
vector<word_time> the_most_ten() {
	vector<word_time> most_ten(10);
	unordered_map<string, word_time>::iterator it = word_list.begin();
	while (it != word_list.end()) {
		if (it->second.time > most_ten[9].time) {
			if (it->second.time > most_ten[0].time)
				most_ten.insert(most_ten.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) {
						most_ten.insert(most_ten.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten.erase(most_ten.begin() + 10, most_ten.end());
	return most_ten;
}

/*
Count the word pairs (phrases) in the files and store them in bi_word_list
*/

void add_a_bi_word(bi_word b_word) {
	if (!is_letter(b_word.str1[0]) || !is_letter(b_word.str2[0]))
		return;                                                            // if the first char of either word is not a letter, return
	bi_word b_word_ = b_word;
	string::iterator it1, it2;
	word_word_time word_word__time;
	it1 = b_word.str1.end();
	it2 = b_word.str2.end();
	it1--; it2--;
	while (!is_letter(*it1)) {
		it1--;
	};																		// back up until *it1 is a letter
	while (!is_letter(*it2)) {
		it2--;
	};
	b_word.str1.erase(it1 + 1, b_word.str1.end());											// strip the trailing non-letter part
	b_word.str2.erase(it2 + 1, b_word.str2.end());
	/*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) {
	if (!is_letter(*it1)) return;
	}																		// (disabled) reject if the first four chars are not all letters
	for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) {
	if (!is_letter(*it2)) return;
	}*/
	//transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper);           // convert to upper case
	//transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper);
	// manual upper-casing: after stripping, only lowercase letters can be >= 'a' here
	for (string::iterator itfirst = b_word.str1.begin(); itfirst < b_word.str1.end(); itfirst++) {
		if (*itfirst >= 'a') *itfirst -= 32;
	}
	for (string::iterator itsecond = b_word.str2.begin(); itsecond < b_word.str2.end(); itsecond++) {
		if (*itsecond >= 'a') *itsecond -= 32;
	}
	string temp = b_word.str1 + b_word.str2;
	bi_word_list[temp].time++;													// put the normalized pair into bi_word_list and bump its count
	if (bi_word_list[temp].word == "" || (bi_word_list[temp].word + bi_word_list[temp].word_s).compare(b_word_.str1 + b_word_.str2)>0) {
		bi_word_list[temp].word = b_word_.str1;
		bi_word_list[temp].word_s = b_word_.str2;
	}															// if the new spelling is smaller than the stored one, update it

}


//"      hello fucking333 world hello fuck fuck abc fucking231 \n hello sd"



/*
Collect the 10 most frequent word pairs in the files
Returns a vector<word_word_time>
*/
vector<word_word_time> the_most_ten_bi() {
	vector<word_word_time> most_ten_bi(10);
	word_word_time temp;
	unordered_map<string, word_word_time>::iterator it = bi_word_list.begin();
	while (it != bi_word_list.end()) {
		/*most_ten_bi[10] = it->second;
		for (int ii = 10; ii >= 1; ii--) {
		if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) {
		temp = most_ten_bi[ii];
		most_ten_bi[ii] = most_ten_bi[ii - 1];
		most_ten_bi[ii - 1] = temp;
		}
		}*/
		if (it->second.time > most_ten_bi[9].time) {
			if (it->second.time > most_ten_bi[0].time)
				most_ten_bi.insert(most_ten_bi.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) {
						most_ten_bi.insert(most_ten_bi.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end());
	return most_ten_bi;
}

/*
Depth-first traversal of a folder and its subdirectories
*/
long sum1 = 0;
int line_sum = 0;
void DfsFolder(string path, int layer)
{
	_finddata_t file_info;
	string current_path = path + "/*.*"; // "/*" would also match everything
	int handle = _findfirst(current_path.c_str(), &file_info);
	// _findfirst returns -1 if the search fails
	ifstream infile;
	string temp, text;
	if (-1 == handle)
	{
		cout << "cannot match the path" << endl;
		return;
	}

	do
	{
		// check whether the entry is a subdirectory
		if (file_info.attrib & _A_SUBDIR)   // attrib is a bit mask, so test the flag rather than compare for equality
		{
			// recurse into the subdirectory
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
			cout << "--";
			cout << file_info.name << endl;*/
			int layer_tmp = layer;
			if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0)  // "." is the current directory and ".." is the parent; both must be skipped
				DfsFolder(path + '/' + file_info.name, layer_tmp + 1); // on Windows "\\" also works as a separator, but it is not recommended
		}
		else
		{
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
			cout << "--";
			cout << file_info.name << endl;*/
			infile.open(path + '/' + file_info.name, ios::in);
			//line_sum++;
			/*infile.seekg(0, ios::end);
			if (infile.get() == '\n')
				line_sum++;
			infile.seekg(0, ios::beg);*/
			while (getline(infile, temp)) {
				//text.append(temp);
				//cout << temp << endl;
				sum1 += temp.length();
				//if (temp.length()!=0)
				line_sum++;
				insert_into_wordlist(temp);
			}
			if (temp == "")line_sum++;
			//insert_into_wordlist(text);
			infile.close();
		}
	} while (!_findnext(handle, &file_info));  // _findnext returns 0 while there are more entries, non-zero when the traversal is done
											   // close the find handle
	_findclose(handle);
}

int main(int argc, char * argv[]) {
	//clock_t startTime, endTime;
	//startTime = clock();
	string path = argv[1];
	DfsFolder(path, 0);
	//DfsFolder("E:/Samples", 0);
	ofstream outfile;
	outfile.open("result.out", ios::out);
	//outfile.flags(ios::left);
	outfile << "char_number :" << sum1 << endl;
	outfile << "line_number :" << line_sum << endl;
	outfile << "word_number :" << sum << endl;
	outfile << endl;
	//outfile.open("result.out", ios::out);
	vector<word_word_time> a = the_most_ten_bi();
	outfile << "the top ten frequency of phrase :" << endl;
	for (int ii = 0; ii < 10; ii++)
		outfile << a[ii].word << ' ' << a[ii].word_s <<' '<< a[ii].time << endl;
	vector<word_time> b = the_most_ten();
	outfile << endl;
	outfile << "the top ten frequency of word :" << endl;
	for (int ii = 0; ii < 10; ii++)
		outfile << b[ii].word << ' ' << b[ii].time << endl;
	outfile.close();
	//endTime = clock();
	//cout << "Totle Time : " << (double)(endTime - startTime) / CLOCKS_PER_SEC << "s" << endl;
	return 0;
}

/*
int main() {
//time_t start = clock();
DfsFolder("E:/Samples", 0);
//word_time test = word_list["THAT"];
//top ten word pairs
vector<word_word_time> a=the_most_ten_bi();
cout << "bi_word_most" << endl;
for (int ii = 0; ii < 10; ii++)
cout << a[ii].word << ' ' << a[ii].word_s << ' ' << a[ii].time << endl;
//top ten words
vector<word_time> b = the_most_ten();
cout << "word_most" << endl;
for (int ii = 0; ii < 10; ii++)
cout << b[ii].word << ' ' << b[ii].time << endl;
//total number of characters
cout << "char_sum" << endl;
cout << sum1 << endl;
//total number of words
cout << "word_sum" << endl;
cout << sum << endl;
//total number of lines
cout << "line_sum" << endl;
cout << line_sum << endl;
//time_t end = clock();
//cout << "xunxingshijian" << double(start - end) << endl;
system("pause");

//vector<word_word_time

}*/


/*
int main() {
bi_word a, b, c, d, e, f;
a.str1 = "hello"; a.str2 = "world";
b.str1 = "hello1"; b.str2 = "world";
c.str1 = "hello"; c.str2 = "world2";
d.str1 = "hello"; d.str2 = "fuck33";
e.str1 = "world"; e.str2 = "hello2";
f.str1 = "fucking"; f.str2 = "world";
add_a_bi_word(a);
add_a_bi_word(b);
add_a_bi_word(c);
add_a_bi_word(d);
add_a_bi_word(e);
add_a_bi_word(f);
}
*/








/*
int main() {
string h = "      hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd";
//string h = "    abc";
insert_into_wordlist(h);
vector<word_time> ten_word=the_most_ten();
vector<word_word_time>ten_bi_word = the_most_ten_bi();
system("pause");
return 0;
}
*/


/*
int main() {
//define the struct; during the search it holds the attributes of the file that was found
_finddata_t file;
//search for all files; _findfirst returns -1 on failure and a handle on success
int k;
long HANDLE;
k = HANDLE = _findfirst("*.*", &file);
//using the handle, keep fetching the next file until no new file can be found
while (k != -1) {
//cout << file.name << endl;  per-file processing goes here
k = _findnext(HANDLE, &file);
}
_findclose(HANDLE);
return 0;
}*/
/*
int main() {
string word = "StRt123546";
if (!(word[0] >= 65 && word[0] <= 90 || word[0] >= 97 && word[0] <= 122))
return 0;
string word_ = word;
string::iterator it=word.end();
it--;
while (!(*it >= 65 && *it <= 90 || *it >= 97 && *it <= 122)) {
it--;
};// back up until *it is a letter
word.erase(it+1, word.end());
for (it = word.begin(); it - word.begin() < 4; it++) {
if (!is_letter(*it)) return 0;
}
transform(word.begin(), word.end(), word.begin(), ::toupper);
word_list[word].time++;													// put the normalized word into word_list and bump its count
if (word_list[word].word == "" || word_list[word].word.compare(word_)) {
word_list[word].word = word_;
}															// if word_ is smaller than the stored spelling, update it
cout << word << endl;
system("pause");
return 0;
}*/



The current hot path in the call tree:



It shows that the insert function is still the biggest memory consumer; scrolling down to the very bottom,


it turns out the hash_map itself is what eats the CPU. That cost is inherent to the data structure and is already fixed the moment the container is chosen, so this can be considered the end point of the optimization.

Code under Linux:

#include <dirent.h>
#include <sys/stat.h>
#include <iostream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <cstdio>        // perror, printf
#include <vector>        // std::vector
#include <algorithm>
#include <fstream>
#include <time.h>
#include <iomanip>
using namespace std;


class word_time {
public:
	string word;
	int time;
public:
	word_time(){
		this->word = "";
		this->time = 0;
	}
};

class word_word_time : public word_time {
public:
	string word_s;
	word_word_time() {
		this->time = 0;
		this->word = "";
		this->word_s = "";
	}
	void operator=(const word_word_time &another) {
		this->time = another.time;
		this->word = another.word;
		this->word_s = another.word_s;
	}
};

class bi_word {
public:
	string str1;
	string str2;
	bi_word() {
		this->str1 = "";
		this->str2 = "";
	}

	bi_word(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}

	bool operator==(const bi_word &another) {
		if (this->str1 == another.str1&&this->str2 == another.str2)
			return true;
		return false;
	}

	void operator=(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}
};

unordered_map<string, word_time> word_list;
unordered_map<string, word_word_time> bi_word_list;

/*
Check whether a char is a letter
Parameter type: char
*/
bool is_letter(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122)
		return true;
	return false;
}
/*
Check whether a char is a separator (anything other than a letter or digit)
*/
bool is_fengefu(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57)
		return false;
	return true;
}

/*
Add a word to word_list and update its count
Parameter type: string
*/
void add_a_word(string word) {
	if (!is_letter(word[0]))
		return;                                                            // if word[0] is not a letter, return
	string word_ = word;
	string::iterator it;
	word_time word__time;
	it = word.end();
	it--;
	while (!is_letter(*it)) {
		it--;
	};																		// back up until *it is a letter
	word.erase(it+1, word.end());											// keep only the leading part (strip trailing non-letters)
	/*for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return;
	}*/																		// (disabled) reject the word if its first four chars are not all letters
	transform(word.begin(), word.end(), word.begin(), ::toupper);           // convert to upper case
	//word_time one = word_list[word];
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it

}

/*
Count the number of characters in a line
Parameter type: string
*/
int count_char_sum(string str) {
	return(str.length());
}

/*
Forward declaration of add_a_bi_word
*/
void add_a_bi_word(bi_word b_word);


/*
Feed the words of one line into word_list, build the n-1 adjacent word pairs, and feed those pairs into bi_word_list (which is also a map)
*/
int sum=0;
string str_temp,str_now;
void insert_into_wordlist(string &line) {
	vector<vector<string>> wordlist_of_a_line_vec;
	vector<bi_word> bi_wordlist_of_a_line;
	bi_word temp;
	//string::iterator it=line.begin(),it1=line.begin();
	int it_last=0,it1;
	bool flag=false;

	line.append(" ");
	for (; is_fengefu(line[it_last])&&(size_t)it_last<line.length(); it_last++);
	for (int it=it_last; line[it]!= '\0'&& (size_t)it<line.length(); it++) {
		if (is_fengefu(line[it])) {

			for (it1 = it_last; it1 - it_last < 4 &&(size_t)it1<line.length(); it1++) {
				if (!is_letter(line[it1])) {
					flag = true;
					break;
				}
			}// check whether the token is a word; if not, drop it
			if (flag == false) {// it is a word
				/*if(wordlist_of_a_line_vec[0].size<40)
					wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));  // append to the per-line word list
				else {
					wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last));
				}*/
				sum++;
				str_now = line.substr(it_last, it - it_last);
				add_a_word(str_now);
				if (str_temp != "") {
					temp.str1 = str_temp;
					temp.str2 = str_now;
					add_a_bi_word(temp);
				}
				str_temp = str_now;
			}
				/*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/
					//wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));

			flag = false;
			it_last = it + 1;
		}

	}

	// map the words of the line into the hash maps one by one
	/*int ii = 0;
	for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) {

		add_a_word(*it1);// add each word to the dictionary

		if (it1 < wordlist_of_a_line_vec[0].end() - 1) {
			bi_wordlist_of_a_line.push_back(temp);
			bi_wordlist_of_a_line[ii].str1 = *it1;
			bi_wordlist_of_a_line[ii].str2 = *(it1+1);
			ii++;
		}             // append to the word-pair list
	}                                       // add each word to the dictionary
	for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) {
		add_a_bi_word(*it2);
	}*/
}


/*
Collect the 10 most frequent words in the files
Returns a vector<word_time>
*/
vector<word_time> the_most_ten() {
	vector<word_time> most_ten(10);
	unordered_map<string, word_time>::iterator it = word_list.begin();
	while (it != word_list.end()) {
		if (it->second.time > most_ten[9].time) {
			if (it->second.time > most_ten[0].time)
				most_ten.insert(most_ten.begin(), it->second);
			else
			for (int ii = 1; ii<=9; ii++) {
				if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) {
					most_ten.insert(most_ten.begin() + ii, it->second);
					break;
				}
			}
			//if(it->second.time > most_ten[0].time)
				//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten.erase(most_ten.begin() + 10, most_ten.end());
	return most_ten;
}

/*
Count the word pairs (phrases) in the files and store them in bi_word_list
*/

void add_a_bi_word(bi_word b_word) {
	if (!is_letter(b_word.str1[0])|| !is_letter(b_word.str2[0]))
		return;                                                            // if the first char of either word is not a letter, return
	bi_word b_word_ = b_word;
	string::iterator it1,it2;
	word_word_time word_word__time;
	it1 = b_word.str1.end();
	it2 = b_word.str2.end();
	it1--; it2--;
	while (!is_letter(*it1)) {
		it1--;
	};																		// back up until *it1 is a letter
	while (!is_letter(*it2)) {
		it2--;
	};
	b_word.str1.erase(it1 + 1, b_word.str1.end());											// strip the trailing non-letter part
	b_word.str2.erase(it2 + 1, b_word.str2.end());
	/*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) {
		if (!is_letter(*it1)) return;
	}																		// (disabled) reject if the first four chars are not all letters
	for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) {
		if (!is_letter(*it2)) return;
	}*/
	//transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper);           // convert to upper case
	//transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper);
	// manual upper-casing: after stripping, only lowercase letters can be >= 'a' here
	for (string::iterator itfirst = b_word.str1.begin(); itfirst < b_word.str1.end(); itfirst++) {
		if (*itfirst >= 'a') *itfirst -= 32;
	}
	for (string::iterator itsecond = b_word.str2.begin(); itsecond < b_word.str2.end(); itsecond++) {
		if (*itsecond >= 'a') *itsecond -= 32;
	}
	string temp = b_word.str1 + b_word.str2;
	bi_word_list[temp].time++;													// put the normalized pair into bi_word_list and bump its count
	if (bi_word_list[temp].word == "" || (bi_word_list[temp].word+ bi_word_list[temp].word_s).compare(b_word_.str1+b_word_.str2)>0) {
		bi_word_list[temp].word = b_word_.str1;
		bi_word_list[temp].word_s = b_word_.str2;
	}															// if the new spelling is smaller than the stored one, update it

}


//"      hello fucking333 world hello fuck fuck abc fucking231 \n hello sd"



/*
Collect the 10 most frequent word pairs in the files
Returns a vector<word_word_time>
*/
vector<word_word_time> the_most_ten_bi() {
	vector<word_word_time> most_ten_bi(10);
	word_word_time temp;
	unordered_map<string, word_word_time>::iterator it = bi_word_list.begin();
	while (it != bi_word_list.end()) {
		/*most_ten_bi[10] = it->second;
		for (int ii = 10; ii >= 1; ii--) {
			if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) {
				temp = most_ten_bi[ii];
				most_ten_bi[ii] = most_ten_bi[ii - 1];
				most_ten_bi[ii - 1] = temp;
			}
		}*/
		if (it->second.time > most_ten_bi[9].time) {
			if (it->second.time > most_ten_bi[0].time)
				most_ten_bi.insert(most_ten_bi.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) {
						most_ten_bi.insert(most_ten_bi.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end());
	return most_ten_bi;
}

/*
Depth-first traversal of a folder and its subdirectories
*/
long sum1 = 0;
int line_sum = 0;
void DfsFolder(string lname)
{
DIR *dir_ptr;
    struct stat infobuf;
    struct dirent *direntp;
    string name, temp;
    ifstream infile;
    string  text;
    if ((dir_ptr = opendir(lname.c_str())) == NULL)
        perror("can not open");
    else
    {
        while ((direntp = readdir(dir_ptr)) != NULL)
        {
            temp = "";
            name = direntp->d_name;
            if (name == "." || name==".." )
            {
                ;
            }
            else
            {
                temp+=lname;
                temp+="/";
                temp+=name;

                //strcat(temp, lname);
                //strcat(temp, "/");
                //strcat(temp, name);
                if ((stat(temp.c_str(), &infobuf)) == -1)
                    printf("#########\n");
                if ((infobuf.st_mode & 0170000) == 0040000)
                {
                    //printf("%s",name);
                    //printf("  this is a directory\n");
                    DfsFolder(temp);
                }
                else
                {
                    //printf("%s",name);
                    //printf("  this is a file\n");
                    infile.open(temp, ios::in);
                    //line_sum++;
                    while (getline(infile, text)) {
                        //text.append(temp);
                        //cout << temp << endl;
                        sum1 += text.length();
                        line_sum++;
                        insert_into_wordlist(text);
                    }
                    if(text == "") line_sum++;   // if the last line read was empty, count it as a line
                    //insert_into_wordlist(text);
                    infile.close();
                }
            }
        }
    }
    closedir(dir_ptr);
}

int main(int argc, char * argv[]) {
    string path=argv[1];
	DfsFolder(path);
    ofstream outfile;
    outfile.open("result.out", ios::out);
    //outfile.flags(ios::left);
	outfile << "char_number :" << sum1 << endl;
    outfile << "line_number :" << line_sum << endl;
	outfile << "word_number :" << sum << endl;
    outfile <<  endl;
    //outfile.open("result.out", ios::out);
    vector<word_word_time> a=the_most_ten_bi();
    outfile << "the top ten frequency of phrase :" << endl;
    for (int ii = 0; ii < 10; ii++)
        outfile <<   a[ii].word << ' ' << a[ii].word_s <<setw(10) << a[ii].time << endl;
    vector<word_time> b = the_most_ten();
    outfile << endl;
    outfile << "the top ten frequency of word :" << endl;
    for (int ii = 0; ii < 10; ii++)
        outfile  <<  b[ii].word << setw(10)  << b[ii].time << endl;
    outfile.close();
}


/*
int main() {
	bi_word a, b, c, d, e, f;
	a.str1 = "hello"; a.str2 = "world";
	b.str1 = "hello1"; b.str2 = "world";
	c.str1 = "hello"; c.str2 = "world2";
	d.str1 = "hello"; d.str2 = "fuck33";
	e.str1 = "world"; e.str2 = "hello2";
	f.str1 = "fucking"; f.str2 = "world";
	add_a_bi_word(a);
	add_a_bi_word(b);
	add_a_bi_word(c);
	add_a_bi_word(d);
	add_a_bi_word(e);
	add_a_bi_word(f);
}
*/








/*
int main() {
	string h = "      hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd";
	//string h = "    abc";
	insert_into_wordlist(h);
	vector<word_time> ten_word=the_most_ten();
	vector<word_word_time>ten_bi_word = the_most_ten_bi();
	system("pause");
	return 0;
}
*/


/*
int main() {
	//define the struct; during the search it holds the attributes of the file that was found
	_finddata_t file;
	//search for all files; _findfirst returns -1 on failure and a handle on success
	int k;
	long HANDLE;
	k = HANDLE = _findfirst("*.*", &file);
	//using the handle, keep fetching the next file until no new file can be found
	while (k != -1) {
		//cout << file.name << endl;  per-file processing goes here
		k = _findnext(HANDLE, &file);
	}
	_findclose(HANDLE);
	return 0;
}*/

Profiling the code with gprof:
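For reference, a typical gprof workflow looks roughly like the following; the file names and the sample path are placeholders, not the ones used in the original run:

# build with profiling instrumentation
g++ -O2 -pg wordcount.cpp -o wordcount

# run the program normally; this writes gmon.out into the working directory
./wordcount /path/to/samples

# turn gmon.out into a flat profile and call graph
gprof ./wordcount gmon.out > profile.txt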


You can see that even after the optimizations, unordered_map's hash function still takes a large share of the CPU time and a lot of memory.


At the call-tree level, traversing and inserting into the unordered_map is what costs the most. This is a limitation of the STL container itself; some classmates tried writing their own hash table and found it performed no better than the STL one.


Summary:

This project essentially uses an unordered_map as the dictionary; admittedly, it would be even more convenient to implement in Python. Every word is normalized before being stored in the unordered_map, which makes the later frequency statistics very easy.



Reposted from blog.csdn.net/z1991998920/article/details/79689179