Word-frequency counting with unordered_map

Preface:

        The first software engineering assignment is to compute word-frequency statistics over the files in a folder; the detailed requirements are posted on the course blog.

        Assignment: Software Engineering, Assignment 1

Requirements:

       

1. Count the number of characters in the files

2. Count the total number of words in the files

3. Count the total number of lines in the files

4. Count how many times each word appears in the files

5. Run the statistics over all files in a given folder and, recursively, its subfolders

6. Count how often two words appear together (phrases) and output the 10 most frequent ones

PSP:

PSP table

PSP Stage                                 Description                                         Time (h)
Planning                                  Planning                                            20
  Estimate                                Estimate how much time the task will take           20
Development                               Development                                         52
  Analysis                                Requirements analysis (incl. learning new tech)     2
  Design Spec                             Produce the design document                         1
  Design                                  Detailed design                                     4
  Coding                                  Coding                                              13
  Code Review                             Code review                                         2
  Test                                    Testing (self-test, fix code, commit fixes)         30
Reporting                                 Reporting                                           10
  Test Report                             Test report                                         6
  Size Measurement                        Measure the size of the work                        2
  Postmortem & Process Improvement Plan   Postmortem and process improvement plan             2

Approach:

        At first I planned to write a trie, whose lookup cost is O(n) in the length of the word. Then it occurred to me: why not use a hash table instead, where a lookup is amortized O(1)? So I set out to write a hash table in one go. While Googling how to write a perfect hash, an expert mentioned unordered_map. (Thanks here to the expert Zhao R, who prefers to stay anonymous.)
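        To make that trade-off concrete, here is a small illustrative sketch of my own (not code from this project): a hand-written trie needs an explicit node type and walks one pointer per character, while the hash-table version is a single container plus one line per update.

#include <iostream>
#include <string>
#include <unordered_map>

// What a hand-written trie would require: one node per prefix,
// and a walk of length L (the word length) for every lookup.
struct TrieNode {
	TrieNode* next[26] = {};
	int count = 0;
};

// The hash-table alternative: the whole dictionary is one container,
// and each update is an amortized O(1) operation on the key.
std::unordered_map<std::string, int> freq;

int main() {
	for (std::string w : { "that", "the", "that" })
		++freq[w];                      // insert-or-increment in one step
	std::cout << freq["that"] << '\n';  // prints 2
}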

        I then Googled unordered_map; the most thorough blog post I found is this one, and the author explains it in great detail.

        I also brushed up on how to use map; details here.
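        As a quick refresher (a minimal example of my own, not taken from the linked posts), the handful of unordered_map operations this project relies on look like this:

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
	std::unordered_map<std::string, int> counts;

	counts["hello"] = 1;              // operator[] default-constructs the value for a new key
	++counts["hello"];                // ...so it doubles as insert-or-update
	counts.insert({ "world", 5 });    // insert() leaves an existing key untouched

	auto it = counts.find("hello");   // find() looks up without inserting
	if (it != counts.end())
		std::cout << it->first << " -> " << it->second << '\n';

	for (const auto& kv : counts)     // iteration order is unspecified for a hash table
		std::cout << kv.first << ": " << kv.second << '\n';
}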

        OK, time to get to work~

class word_time {
public:
	string word; // the spelling used for the final output (the lexicographically smallest form seen)
	int time;    // number of times the word appears
public:
	word_time(){   // constructor: initialize the fields
		this->word = "";
		this->time = 0;
	}
};
unordered_map<string, word_time> word_list;   // a dictionary from a word (its normalized form) to a class holding its simplest spelling and its count
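Standalone, the update rule this map is meant to support looks roughly like the sketch below (record() is a hypothetical helper, not a function from the project): the key is the upper-cased word, and the value keeps the running count plus the lexicographically smallest original spelling for the final output.

#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>

struct word_time { std::string word; int time = 0; };

std::unordered_map<std::string, word_time> word_list;

void record(const std::string& original) {
	std::string key = original;
	std::transform(key.begin(), key.end(), key.begin(), ::toupper);
	word_time& entry = word_list[key];       // the first sighting creates {"", 0}
	++entry.time;
	if (entry.word.empty() || original < entry.word)
		entry.word = original;               // remember the smallest spelling seen so far
}

int main() {
	record("That"); record("THAT"); record("that");
	std::cout << word_list["THAT"].word << " x " << word_list["THAT"].time << '\n';  // THAT x 3
}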

That is the main idea. Without further ado, here is the code~

Code:

        

#include <io.h>
#include <iostream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <cstring>       // strcmp
#include <vector>        // std::vector
#include <algorithm>
#include <fstream>
using namespace std;


class word_time {
public:
	string word;
	int time;
public:
	word_time(){
		this->word = "";
		this->time = 0;
	}
};

class word_word_time : public word_time {
public:
	string word_s;
	word_word_time() {
		this->time = 0;
		this->word = "";
		this->word_s = "";
	}
	void operator=(const word_word_time &another) {
		this->time = another.time;
		this->word = another.word;
		this->word_s = another.word_s;
	}
};

class bi_word {
public:
	string str1;
	string str2;
	bi_word() {
		this->str1 = "";
		this->str2 = "";
	}

	bi_word(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}

	bool operator==(const bi_word &another) {
		if (this->str1 == another.str1&&this->str2 == another.str2)
			return true;
		return false;
	}

	void operator=(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}
};

unordered_map<string, word_time> word_list;
unordered_map<string, word_word_time> bi_word_list;

/*
Check whether a char is a letter
Parameter type: char
*/
bool is_letter(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122)
		return true;
	return false;
}
/*
Check whether a char is a separator (anything other than a letter or digit)
*/
bool is_fengefu(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57)
		return false;
	return true;
}

/*
Add a word to word_list and update its count
Parameter type: string
*/
void add_a_word(string word) {
	if (!is_letter(word[0]))
		return;                                                            // if word[0] is not a letter, return
	string word_ = word;
	string::iterator it;
	word_time word__time;
	it = word.end();
	it--;
	while (!is_letter(*it)) {
		it--;
	};																		// back up until *it is a letter
	word.erase(it+1, word.end());											// keep only the leading part (strip trailing non-letters)
	/*for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return;
	}*/																		// (disabled) reject the word if its first four chars are not all letters
	transform(word.begin(), word.end(), word.begin(), ::toupper);           // convert to upper case
	//word_time one = word_list[word];
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it

}

/*
Count the number of characters in a line
Parameter type: string
*/
int count_char_sum(string str) {
	return(str.length());
}

/*
Forward declaration of add_a_bi_word
*/
void add_a_bi_word(bi_word b_word);


/*
Feed the words of one line into word_list, build the n-1 adjacent word pairs, and feed those pairs into bi_word_list (which is also a map)
*/
int sum=0;
string str_temp,str_now;
void insert_into_wordlist(string &line) {
	vector<vector<string>> wordlist_of_a_line_vec;
	vector<bi_word> bi_wordlist_of_a_line;
	bi_word temp;
	//string::iterator it=line.begin(),it1=line.begin();
	int it_last=0,it1;
	bool flag=false;

	line.append(" ");
	for (; is_fengefu(line[it_last])&&(size_t)it_last<line.length(); it_last++);
	for (int it=it_last; line[it]!= '\0'&& (size_t)it<line.length(); it++) {
		if (is_fengefu(line[it])) {

			for (it1 = it_last; it1 - it_last < 4 &&(size_t)it1<line.length(); it1++) {
				if (!is_letter(line[it1])) {
					flag = true;
					break;
				}
			}// check whether the token is a word; if not, drop it
			if (flag == false) {// it is a word
				/*if(wordlist_of_a_line_vec[0].size<40)
					wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));  // append to the per-line word list
				else {
					wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last));
				}*/
				sum++;
				str_now = line.substr(it_last, it - it_last);
				add_a_word(str_now);
				if (str_temp != "") {
					temp.str1 = str_temp;
					temp.str2 = str_now;
					add_a_bi_word(temp);
				}
				str_temp = str_now;
			}
				/*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/
					//wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));
				
			flag = false;
			it_last = it + 1;
		}

	}

	// map the words of the line into the hash maps one by one
	/*int ii = 0;
	for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) {

		add_a_word(*it1);// add each word to the dictionary

		if (it1 < wordlist_of_a_line_vec[0].end() - 1) {
			bi_wordlist_of_a_line.push_back(temp);
			bi_wordlist_of_a_line[ii].str1 = *it1;
			bi_wordlist_of_a_line[ii].str2 = *(it1+1);
			ii++;
		}             // append to the word-pair list
	}                                       // add each word to the dictionary
	for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) {
		add_a_bi_word(*it2);
	}*/
}


/*
Collect the 10 most frequent words in the files
Returns a vector<word_time>
*/
vector<word_time> the_most_ten() {
	vector<word_time> most_ten(10);
	unordered_map<string, word_time>::iterator it = word_list.begin();
	while (it != word_list.end()) {
		if (it->second.time > most_ten[9].time) {
			if (it->second.time > most_ten[0].time)
				most_ten.insert(most_ten.begin(), it->second);
			else
			for (int ii = 1; ii<=9; ii++) {
				if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) {
					most_ten.insert(most_ten.begin() + ii, it->second);
					break;
				}
			}
			//if(it->second.time > most_ten[0].time)
				//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten.erase(most_ten.begin() + 10, most_ten.end());
	return most_ten;
}

/*
Count the word pairs (phrases) in the files and store them in bi_word_list
*/

void add_a_bi_word(bi_word b_word) {
	if (!is_letter(b_word.str1[0])|| !is_letter(b_word.str2[0]))
		return;                                                            // if the first char of either word is not a letter, return
	bi_word b_word_ = b_word;
	string::iterator it1,it2;
	word_word_time word_word__time;
	it1 = b_word.str1.end();
	it2 = b_word.str2.end();
	it1--; it2--;
	while (!is_letter(*it1)) {
		it1--;
	};																		// back up until *it1 is a letter
	while (!is_letter(*it2)) {
		it2--;
	};
	b_word.str1.erase(it1 + 1, b_word.str1.end());											// strip the trailing non-letter part
	b_word.str2.erase(it2 + 1, b_word.str2.end());
	/*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) {
		if (!is_letter(*it1)) return;
	}																		// (disabled) reject if the first four chars are not all letters
	for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) {
		if (!is_letter(*it2)) return;
	}*/
	transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper);           // convert to upper case
	transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper);
	string temp = b_word.str1 + b_word.str2;
	bi_word_list[temp].time++;													// put the normalized pair into bi_word_list and bump its count
	if (bi_word_list[temp].word == "" || (bi_word_list[temp].word+ bi_word_list[temp].word_s).compare(b_word_.str1+b_word_.str2)>0) {
		bi_word_list[temp].word = b_word_.str1;
		bi_word_list[temp].word_s = b_word_.str2;
	}															// if the new spelling is smaller than the stored one, update it

}


//"      hello fucking333 world hello fuck fuck abc fucking231 \n hello sd"



/*
Collect the 10 most frequent word pairs in the files
Returns a vector<word_word_time>
*/
vector<word_word_time> the_most_ten_bi() {
	vector<word_word_time> most_ten_bi(10);
	word_word_time temp;
	unordered_map<string, word_word_time>::iterator it = bi_word_list.begin();
	while (it != bi_word_list.end()) {
		/*most_ten_bi[10] = it->second;
		for (int ii = 10; ii >= 1; ii--) {
			if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) {
				temp = most_ten_bi[ii];
				most_ten_bi[ii] = most_ten_bi[ii - 1];
				most_ten_bi[ii - 1] = temp;
			}
		}*/
		if (it->second.time > most_ten_bi[9].time) {
			if (it->second.time > most_ten_bi[0].time)
				most_ten_bi.insert(most_ten_bi.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) {
						most_ten_bi.insert(most_ten_bi.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end());
	return most_ten_bi;
}

/*
Depth-first traversal of a folder and its subdirectories
*/
long sum1 = 0;
int line_sum = 0;
void DfsFolder(string path, int layer)
{
	_finddata_t file_info;
	string current_path = path + "/*.*"; // "/*" would also match everything
	int handle = _findfirst(current_path.c_str(), &file_info);
	// _findfirst returns -1 if the search fails
	ifstream infile;
	string temp, text;
	if (-1 == handle)
	{
		cout << "cannot match the path" << endl;
		return;
	}

	do
	{
		// check whether the entry is a subdirectory
		if (file_info.attrib & _A_SUBDIR)   // attrib is a bit mask, so test the flag rather than compare for equality
		{
			// recurse into the subdirectory
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
				cout << "--";
			cout << file_info.name << endl;*/
			int layer_tmp = layer;
			if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0)  // "." is the current directory and ".." is the parent; both must be skipped
				DfsFolder(path + '/' + file_info.name, layer_tmp + 1); // on Windows "\\" also works as a separator, but it is not recommended
		}
		else
		{
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
				cout << "--";
			cout << file_info.name << endl;*/
			infile.open(path + '/' + file_info.name, ios::in);
			while (getline(infile, temp)) {
				//text.append(temp);
				//cout << temp << endl;
				sum1 += temp.length();
				line_sum++;
				insert_into_wordlist(temp);
			}
			//insert_into_wordlist(text);
			infile.close();
		}
	} while (!_findnext(handle, &file_info));  // _findnext returns 0 while there are more entries, non-zero when the traversal is done
											   // close the find handle
	_findclose(handle);
}

int main() {
	DfsFolder("E:/tales", 0);
	word_time test = word_list["THAT"];
	// top ten word pairs
	vector<word_word_time> a=the_most_ten_bi();
	// top ten words
	vector<word_time> b = the_most_ten();
	// total number of characters
	sum1;
	// total number of words
	sum;
	// total number of lines
	line_sum;

	//vector<word_word_time

}


/*
int main() {
	bi_word a, b, c, d, e, f;
	a.str1 = "hello"; a.str2 = "world";
	b.str1 = "hello1"; b.str2 = "world";
	c.str1 = "hello"; c.str2 = "world2";
	d.str1 = "hello"; d.str2 = "fuck33";
	e.str1 = "world"; e.str2 = "hello2";
	f.str1 = "fucking"; f.str2 = "world";
	add_a_bi_word(a);
	add_a_bi_word(b);
	add_a_bi_word(c);
	add_a_bi_word(d);
	add_a_bi_word(e);
	add_a_bi_word(f);
}
*/








/*
int main() {
	string h = "      hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd";
	//string h = "    abc";
	insert_into_wordlist(h);
	vector<word_time> ten_word=the_most_ten();
	vector<word_word_time>ten_bi_word = the_most_ten_bi();
	system("pause");
	return 0;
}
*/


/*
int main() {
	//define the struct; during the search it holds the attributes of the file that was found
	_finddata_t file;
	//search for all files; _findfirst returns -1 on failure and a handle on success
	int k;
	long HANDLE;
	k = HANDLE = _findfirst("*.*", &file);
	//using the handle, keep fetching the next file until no new file can be found
	while (k != -1) {
		//cout << file.name << endl;  per-file processing goes here
		k = _findnext(HANDLE, &file);
	}
	_findclose(HANDLE);
	return 0;
}*/
/*
int main() {
	string word = "StRt123546";
	if (!(word[0] >= 65 && word[0] <= 90 || word[0] >= 97 && word[0] <= 122))
		return 0;
	string word_ = word;
	string::iterator it=word.end();
	it--;
	while (!(*it >= 65 && *it <= 90 || *it >= 97 && *it <= 122)) {
		it--;
	};// back up until *it is a letter
	word.erase(it+1, word.end());
	for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return 0;
	}
	transform(word.begin(), word.end(), word.begin(), ::toupper);
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it
	cout << word << endl;
	system("pause");
	return 0;
}*/

The test samples are in the commented-out main() functions above; each one exercises a single unit.
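As a sketch, those drivers could also be written as assert-style checks, assuming add_a_word() and word_list from the listing above are compiled into the same program (test_add_a_word() is a hypothetical helper, not part of the project):

#include <cassert>

void test_add_a_word() {
	word_list.clear();
	add_a_word("Hello123");                       // trailing non-letters should be stripped
	add_a_word("HELLO");
	assert(word_list["HELLO"].time == 2);         // both calls land on the same key
	assert(word_list["HELLO"].word == "HELLO");   // the byte-wise smallest spelling is kept
}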

Full test set

1. An empty set (empty folder)

2. A single text file with no content

3. A single text file containing only one space

4. A single text file containing one space and one newline

5. A single text file (a minimal case)

6. Two files: a .txt (text file) and a .pdf (binary file)

7. Multiple files in multiple folders



Optimizations:

        1. Eliminate repeated computation: cut down the work done on each call inside the insert function; run time drops by about 5%.

        2. Improve the read path: use getline. fread is faster, but it behaved unreliably on some samples, so it was dropped. The code gets slightly faster.

        3. Improve parameter passing: pass the string by reference, which cuts the average run time by about 30% (see the sketch after this list).

        4. Drop the intermediate per-line word list and insert each word as soon as it is read, which cuts the average run time by about 10%.
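        As an illustration of item 3, here is a minimal before/after sketch of my own; the by-value signature is reconstructed for comparison, not code taken from an earlier revision of this project:

#include <string>

// Before (reconstructed): the caller's whole line is copied into the parameter on every call.
static size_t parse_line_by_value(std::string line) { return line.size(); }

// After: passing by reference lets the parser work on the caller's buffer directly,
// which removes the per-line copy and is where the ~30% saving comes from.
static size_t parse_line_by_ref(std::string& line) { return line.size(); }

int main() {
	std::string line = "hello world";
	return (int)(parse_line_by_value(line) - parse_line_by_ref(line));  // both see the same text
}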

The final code:

#include <io.h>
#include <iostream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <cstring>       // strcmp
#include <vector>        // std::vector
#include <algorithm>
#include <fstream>
#include <time.h>
//#include <iomanip>
using namespace std;


class word_time {
public:
	string word;
	int time;
public:
	word_time() {
		this->word = "";
		this->time = 0;
	}
};

class word_word_time : public word_time {
public:
	string word_s;
	word_word_time() {
		this->time = 0;
		this->word = "";
		this->word_s = "";
	}
	void operator=(const word_word_time &another) {
		this->time = another.time;
		this->word = another.word;
		this->word_s = another.word_s;
	}
};

class bi_word {
public:
	string str1;
	string str2;
	bi_word() {
		this->str1 = "";
		this->str2 = "";
	}

	bi_word(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}

	bool operator==(const bi_word &another) {
		if (this->str1 == another.str1&&this->str2 == another.str2)
			return true;
		return false;
	}

	void operator=(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}
};

unordered_map<string, word_time> word_list;
unordered_map<string, word_word_time> bi_word_list;

/*
Check whether a char is a letter
Parameter type: char
*/
bool is_letter(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122)
		return true;
	return false;
}
/*
Check whether a char is a separator (anything other than a letter or digit)
*/
bool is_fengefu(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57)
		return false;
	return true;
}

/*
Add a word to word_list and update its count
Parameter type: string
*/
void add_a_word(string word) {
	if (!is_letter(word[0]))
		return;                                                            // if word[0] is not a letter, return
	string word_ = word;
	string::iterator it;
	word_time word__time;
	it = word.end();
	it--;
	while (!is_letter(*it)) {
		it--;
	};																		// back up until *it is a letter
	word.erase(it + 1, word.end());											// keep only the leading part (strip trailing non-letters)
	/*for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return;
	}*/																		// (disabled) reject the word if its first four chars are not all letters
	transform(word.begin(), word.end(), word.begin(), ::toupper);           // convert to upper case
	//word_time one = word_list[word];
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it

}

/*
Count the number of characters in a line
Parameter type: string
*/
int count_char_sum(string str) {
	return(str.length());
}

/*
Forward declaration of add_a_bi_word
*/
void add_a_bi_word(bi_word b_word);


/*
Feed the words of one line into word_list, build the n-1 adjacent word pairs, and feed those pairs into bi_word_list (which is also a map)
*/
int sum = 0;
string str_temp, str_now;
void insert_into_wordlist(string &line) {
	vector<vector<string>> wordlist_of_a_line_vec;
	vector<bi_word> bi_wordlist_of_a_line;
	bi_word temp;
	//string::iterator it=line.begin(),it1=line.begin();
	int it_last = 0, it1;
	bool flag = false;

	line.append(" ");
	for (; is_fengefu(line[it_last]) && (size_t)it_last<line.length(); it_last++);
	for (int it = it_last; line[it] != '\0' && (size_t)it<line.length(); it++) {
		if (is_fengefu(line[it])) {

			for (it1 = it_last; it1 - it_last < 4 && (size_t)it1<line.length(); it1++) {
				if (!is_letter(line[it1])) {
					flag = true;
					break;
				}
			}// check whether the token is a word; if not, drop it
			if (flag == false) {// it is a word
				/*if(wordlist_of_a_line_vec[0].size<40)
				wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));  // append to the per-line word list
								else {
								wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last));
								}*/
				sum++;
				str_now = line.substr(it_last, it - it_last);
				add_a_word(str_now);
				if (str_temp != "") {
					temp.str1 = str_temp;
					temp.str2 = str_now;
					add_a_bi_word(temp);
				}
				str_temp = str_now;
			}
			/*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/
			//wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));

			flag = false;
			it_last = it + 1;
		}

	}

	// map the words of the line into the hash maps one by one
	/*int ii = 0;
	for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) {

	add_a_word(*it1);// add each word to the dictionary

	if (it1 < wordlist_of_a_line_vec[0].end() - 1) {
	bi_wordlist_of_a_line.push_back(temp);
	bi_wordlist_of_a_line[ii].str1 = *it1;
	bi_wordlist_of_a_line[ii].str2 = *(it1+1);
	ii++;
	}             // append to the word-pair list
	}                                       // add each word to the dictionary
	for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) {
	add_a_bi_word(*it2);
	}*/
}


/*
Collect the 10 most frequent words in the files
Returns a vector<word_time>
*/
vector<word_time> the_most_ten() {
	vector<word_time> most_ten(10);
	unordered_map<string, word_time>::iterator it = word_list.begin();
	while (it != word_list.end()) {
		if (it->second.time > most_ten[9].time) {
			if (it->second.time > most_ten[0].time)
				most_ten.insert(most_ten.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) {
						most_ten.insert(most_ten.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten.erase(most_ten.begin() + 10, most_ten.end());
	return most_ten;
}

/*
Count the word pairs (phrases) in the files and store them in bi_word_list
*/

void add_a_bi_word(bi_word b_word) {
	if (!is_letter(b_word.str1[0]) || !is_letter(b_word.str2[0]))
		return;                                                            // if the first char of either word is not a letter, return
	bi_word b_word_ = b_word;
	string::iterator it1, it2;
	word_word_time word_word__time;
	it1 = b_word.str1.end();
	it2 = b_word.str2.end();
	it1--; it2--;
	while (!is_letter(*it1)) {
		it1--;
	};																		// back up until *it1 is a letter
	while (!is_letter(*it2)) {
		it2--;
	};
	b_word.str1.erase(it1 + 1, b_word.str1.end());											// strip the trailing non-letter part
	b_word.str2.erase(it2 + 1, b_word.str2.end());
	/*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) {
	if (!is_letter(*it1)) return;
	}																		// (disabled) reject if the first four chars are not all letters
	for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) {
	if (!is_letter(*it2)) return;
	}*/
	//transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper);           // convert to upper case
	//transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper);
	// manual upper-casing: after stripping, only lowercase letters can be >= 'a' here
	for (string::iterator itfirst = b_word.str1.begin(); itfirst < b_word.str1.end(); itfirst++) {
		if (*itfirst >= 'a') *itfirst -= 32;
	}
	for (string::iterator itsecond = b_word.str2.begin(); itsecond < b_word.str2.end(); itsecond++) {
		if (*itsecond >= 'a') *itsecond -= 32;
	}
	string temp = b_word.str1 + b_word.str2;
	bi_word_list[temp].time++;													// put the normalized pair into bi_word_list and bump its count
	if (bi_word_list[temp].word == "" || (bi_word_list[temp].word + bi_word_list[temp].word_s).compare(b_word_.str1 + b_word_.str2)>0) {
		bi_word_list[temp].word = b_word_.str1;
		bi_word_list[temp].word_s = b_word_.str2;
	}															// if the new spelling is smaller than the stored one, update it

}


//"      hello fucking333 world hello fuck fuck abc fucking231 \n hello sd"



/*
Collect the 10 most frequent word pairs in the files
Returns a vector<word_word_time>
*/
vector<word_word_time> the_most_ten_bi() {
	vector<word_word_time> most_ten_bi(10);
	word_word_time temp;
	unordered_map<string, word_word_time>::iterator it = bi_word_list.begin();
	while (it != bi_word_list.end()) {
		/*most_ten_bi[10] = it->second;
		for (int ii = 10; ii >= 1; ii--) {
		if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) {
		temp = most_ten_bi[ii];
		most_ten_bi[ii] = most_ten_bi[ii - 1];
		most_ten_bi[ii - 1] = temp;
		}
		}*/
		if (it->second.time > most_ten_bi[9].time) {
			if (it->second.time > most_ten_bi[0].time)
				most_ten_bi.insert(most_ten_bi.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) {
						most_ten_bi.insert(most_ten_bi.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end());
	return most_ten_bi;
}

/*
Depth-first traversal of a folder and its subdirectories
*/
long sum1 = 0;
int line_sum = 0;
void DfsFolder(string path, int layer)
{
	_finddata_t file_info;
	string current_path = path + "/*.*"; // "/*" would also match everything
	int handle = _findfirst(current_path.c_str(), &file_info);
	// _findfirst returns -1 if the search fails
	ifstream infile;
	string temp, text;
	if (-1 == handle)
	{
		cout << "cannot match the path" << endl;
		return;
	}

	do
	{
		// check whether the entry is a subdirectory
		if (file_info.attrib & _A_SUBDIR)   // attrib is a bit mask, so test the flag rather than compare for equality
		{
			// recurse into the subdirectory
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
			cout << "--";
			cout << file_info.name << endl;*/
			int layer_tmp = layer;
			if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0)  // "." is the current directory and ".." is the parent; both must be skipped
				DfsFolder(path + '/' + file_info.name, layer_tmp + 1); // on Windows "\\" also works as a separator, but it is not recommended
		}
		else
		{
			// print markers to reflect the depth level
			/*for (int i = 0; i<layer; i++)
			cout << "--";
			cout << file_info.name << endl;*/
			infile.open(path + '/' + file_info.name, ios::in);
			//line_sum++;
			/*infile.seekg(0, ios::end);
			if (infile.get() == '\n')
				line_sum++;
			infile.seekg(0, ios::beg);*/
			while (getline(infile, temp)) {
				//text.append(temp);
				//cout << temp << endl;
				sum1 += temp.length();
				//if (temp.length()!=0)
				line_sum++;
				insert_into_wordlist(temp);
			}
			if (temp == "")line_sum++;
			//insert_into_wordlist(text);
			infile.close();
		}
	} while (!_findnext(handle, &file_info));  // _findnext returns 0 while there are more entries, non-zero when the traversal is done
											   // close the find handle
	_findclose(handle);
}

int main(int argc, char * argv[]) {
	//clock_t startTime, endTime;
	//startTime = clock();
	string path = argv[1];
	DfsFolder(path, 0);
	//DfsFolder("E:/Samples", 0);
	ofstream outfile;
	outfile.open("result.out", ios::out);
	//outfile.flags(ios::left);
	outfile << "char_number :" << sum1 << endl;
	outfile << "line_number :" << line_sum << endl;
	outfile << "word_number :" << sum << endl;
	outfile << endl;
	//outfile.open("result.out", ios::out);
	vector<word_word_time> a = the_most_ten_bi();
	outfile << "the top ten frequency of phrase :" << endl;
	for (int ii = 0; ii < 10; ii++)
		outfile << a[ii].word << ' ' << a[ii].word_s <<' '<< a[ii].time << endl;
	vector<word_time> b = the_most_ten();
	outfile << endl;
	outfile << "the top ten frequency of word :" << endl;
	for (int ii = 0; ii < 10; ii++)
		outfile << b[ii].word << ' ' << b[ii].time << endl;
	outfile.close();
	//endTime = clock();
	//cout << "Totle Time : " << (double)(endTime - startTime) / CLOCKS_PER_SEC << "s" << endl;
	return 0;
}

/*
int main() {
//time_t start = clock();
DfsFolder("E:/Samples", 0);
//word_time test = word_list["THAT"];
//top ten word pairs
vector<word_word_time> a=the_most_ten_bi();
cout << "bi_word_most" << endl;
for (int ii = 0; ii < 10; ii++)
cout << a[ii].word << ' ' << a[ii].word_s << ' ' << a[ii].time << endl;
//top ten words
vector<word_time> b = the_most_ten();
cout << "word_most" << endl;
for (int ii = 0; ii < 10; ii++)
cout << b[ii].word << ' ' << b[ii].time << endl;
//total number of characters
cout << "char_sum" << endl;
cout << sum1 << endl;
//total number of words
cout << "word_sum" << endl;
cout << sum << endl;
//total number of lines
cout << "line_sum" << endl;
cout << line_sum << endl;
//time_t end = clock();
//cout << "xunxingshijian" << double(start - end) << endl;
system("pause");

//vector<word_word_time

}*/


/*
int main() {
bi_word a, b, c, d, e, f;
a.str1 = "hello"; a.str2 = "world";
b.str1 = "hello1"; b.str2 = "world";
c.str1 = "hello"; c.str2 = "world2";
d.str1 = "hello"; d.str2 = "fuck33";
e.str1 = "world"; e.str2 = "hello2";
f.str1 = "fucking"; f.str2 = "world";
add_a_bi_word(a);
add_a_bi_word(b);
add_a_bi_word(c);
add_a_bi_word(d);
add_a_bi_word(e);
add_a_bi_word(f);
}
*/








/*
int main() {
string h = "      hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd";
//string h = "    abc";
insert_into_wordlist(h);
vector<word_time> ten_word=the_most_ten();
vector<word_word_time>ten_bi_word = the_most_ten_bi();
system("pause");
return 0;
}
*/


/*
int main() {
//define the struct; during the search it holds the attributes of the file that was found
_finddata_t file;
//search for all files; _findfirst returns -1 on failure and a handle on success
int k;
long HANDLE;
k = HANDLE = _findfirst("*.*", &file);
//using the handle, keep fetching the next file until no new file can be found
while (k != -1) {
//cout << file.name << endl;  per-file processing goes here
k = _findnext(HANDLE, &file);
}
_findclose(HANDLE);
return 0;
}*/
/*
int main() {
string word = "StRt123546";
if (!(word[0] >= 65 && word[0] <= 90 || word[0] >= 97 && word[0] <= 122))
return 0;
string word_ = word;
string::iterator it=word.end();
it--;
while (!(*it >= 65 && *it <= 90 || *it >= 97 && *it <= 122)) {
it--;
};// back up until *it is a letter
word.erase(it+1, word.end());
for (it = word.begin(); it - word.begin() < 4; it++) {
if (!is_letter(*it)) return 0;
}
transform(word.begin(), word.end(), word.begin(), ::toupper);
word_list[word].time++;													// put the normalized word into word_list and bump its count
if (word_list[word].word == "" || word_list[word].word.compare(word_)) {
word_list[word].word = word_;
}															// if word_ is smaller than the stored spelling, update it
cout << word << endl;
system("pause");
return 0;
}*/



The current hot path in the call tree:



It shows that the insert function is still the biggest memory consumer; scrolling down to the very bottom,


it turns out the hash_map itself is what eats the CPU. That cost is inherent to the data structure and is already fixed the moment the container is chosen, so this can be considered the end point of the optimization.

Code under Linux:

#include <dirent.h>
#include <sys/stat.h>
#include <iostream>
#include <unordered_map>
#include <string>
#include <cctype>
#include <cstdio>        // perror, printf
#include <vector>        // std::vector
#include <algorithm>
#include <fstream>
#include <time.h>
#include <iomanip>
using namespace std;


class word_time {
public:
	string word;
	int time;
public:
	word_time(){
		this->word = "";
		this->time = 0;
	}
};

class word_word_time : public word_time {
public:
	string word_s;
	word_word_time() {
		this->time = 0;
		this->word = "";
		this->word_s = "";
	}
	void operator=(const word_word_time &another) {
		this->time = another.time;
		this->word = another.word;
		this->word_s = another.word_s;
	}
};

class bi_word {
public:
	string str1;
	string str2;
	bi_word() {
		this->str1 = "";
		this->str2 = "";
	}

	bi_word(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}

	bool operator==(const bi_word &another) {
		if (this->str1 == another.str1&&this->str2 == another.str2)
			return true;
		return false;
	}

	void operator=(const bi_word &another) {
		this->str1 = another.str1;
		this->str2 = another.str2;
	}
};

unordered_map<string, word_time> word_list;
unordered_map<string, word_word_time> bi_word_list;

/*
Check whether a char is a letter
Parameter type: char
*/
bool is_letter(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122)
		return true;
	return false;
}
/*
Check whether a char is a separator (anything other than a letter or digit)
*/
bool is_fengefu(char m) {
	if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57)
		return false;
	return true;
}

/*
Add a word to word_list and update its count
Parameter type: string
*/
void add_a_word(string word) {
	if (!is_letter(word[0]))
		return;                                                            // if word[0] is not a letter, return
	string word_ = word;
	string::iterator it;
	word_time word__time;
	it = word.end();
	it--;
	while (!is_letter(*it)) {
		it--;
	};																		// back up until *it is a letter
	word.erase(it+1, word.end());											// keep only the leading part (strip trailing non-letters)
	/*for (it = word.begin(); it - word.begin() < 4; it++) {
		if (!is_letter(*it)) return;
	}*/																		// (disabled) reject the word if its first four chars are not all letters
	transform(word.begin(), word.end(), word.begin(), ::toupper);           // convert to upper case
	//word_time one = word_list[word];
	word_list[word].time++;													// put the normalized word into word_list and bump its count
	if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) {
		word_list[word].word = word_;
	}															// if word_ is smaller than the stored spelling, update it

}

/*
Count the number of characters in a line
Parameter type: string
*/
int count_char_sum(string str) {
	return(str.length());
}

/*
Forward declaration of add_a_bi_word
*/
void add_a_bi_word(bi_word b_word);


/*
Feed the words of one line into word_list, build the n-1 adjacent word pairs, and feed those pairs into bi_word_list (which is also a map)
*/
int sum=0;
string str_temp,str_now;
void insert_into_wordlist(string &line) {
	vector<vector<string>> wordlist_of_a_line_vec;
	vector<bi_word> bi_wordlist_of_a_line;
	bi_word temp;
	//string::iterator it=line.begin(),it1=line.begin();
	int it_last=0,it1;
	bool flag=false;

	line.append(" ");
	for (; is_fengefu(line[it_last])&&(size_t)it_last<line.length(); it_last++);
	for (int it=it_last; line[it]!= '\0'&& (size_t)it<line.length(); it++) {
		if (is_fengefu(line[it])) {

			for (it1 = it_last; it1 - it_last < 4 &&(size_t)it1<line.length(); it1++) {
				if (!is_letter(line[it1])) {
					flag = true;
					break;
				}
			}// check whether the token is a word; if not, drop it
			if (flag == false) {// it is a word
				/*if(wordlist_of_a_line_vec[0].size<40)
					wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));  // append to the per-line word list
				else {
					wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last));
				}*/
				sum++;
				str_now = line.substr(it_last, it - it_last);
				add_a_word(str_now);
				if (str_temp != "") {
					temp.str1 = str_temp;
					temp.str2 = str_now;
					add_a_bi_word(temp);
				}
				str_temp = str_now;
			}
				/*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/
					//wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last));

			flag = false;
			it_last = it + 1;
		}

	}

	// map the words of the line into the hash maps one by one
	/*int ii = 0;
	for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) {

		add_a_word(*it1);// add each word to the dictionary

		if (it1 < wordlist_of_a_line_vec[0].end() - 1) {
			bi_wordlist_of_a_line.push_back(temp);
			bi_wordlist_of_a_line[ii].str1 = *it1;
			bi_wordlist_of_a_line[ii].str2 = *(it1+1);
			ii++;
		}             // append to the word-pair list
	}                                       // add each word to the dictionary
	for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) {
		add_a_bi_word(*it2);
	}*/
}


/*
Collect the 10 most frequent words in the files
Returns a vector<word_time>
*/
vector<word_time> the_most_ten() {
	vector<word_time> most_ten(10);
	unordered_map<string, word_time>::iterator it = word_list.begin();
	while (it != word_list.end()) {
		if (it->second.time > most_ten[9].time) {
			if (it->second.time > most_ten[0].time)
				most_ten.insert(most_ten.begin(), it->second);
			else
			for (int ii = 1; ii<=9; ii++) {
				if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) {
					most_ten.insert(most_ten.begin() + ii, it->second);
					break;
				}
			}
			//if(it->second.time > most_ten[0].time)
				//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten.erase(most_ten.begin() + 10, most_ten.end());
	return most_ten;
}

/*
Count the word pairs (phrases) in the files and store them in bi_word_list
*/

void add_a_bi_word(bi_word b_word) {
	if (!is_letter(b_word.str1[0])|| !is_letter(b_word.str2[0]))
		return;                                                            // if the first char of either word is not a letter, return
	bi_word b_word_ = b_word;
	string::iterator it1,it2;
	word_word_time word_word__time;
	it1 = b_word.str1.end();
	it2 = b_word.str2.end();
	it1--; it2--;
	while (!is_letter(*it1)) {
		it1--;
	};																		// back up until *it1 is a letter
	while (!is_letter(*it2)) {
		it2--;
	};
	b_word.str1.erase(it1 + 1, b_word.str1.end());											// strip the trailing non-letter part
	b_word.str2.erase(it2 + 1, b_word.str2.end());
	/*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) {
		if (!is_letter(*it1)) return;
	}																		// (disabled) reject if the first four chars are not all letters
	for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) {
		if (!is_letter(*it2)) return;
	}*/
	//transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper);           // convert to upper case
	//transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper);
	// manual upper-casing: after stripping, only lowercase letters can be >= 'a' here
	for (string::iterator itfirst = b_word.str1.begin(); itfirst < b_word.str1.end(); itfirst++) {
		if (*itfirst >= 'a') *itfirst -= 32;
	}
	for (string::iterator itsecond = b_word.str2.begin(); itsecond < b_word.str2.end(); itsecond++) {
		if (*itsecond >= 'a') *itsecond -= 32;
	}
	string temp = b_word.str1 + b_word.str2;
	bi_word_list[temp].time++;													// put the normalized pair into bi_word_list and bump its count
	if (bi_word_list[temp].word == "" || (bi_word_list[temp].word+ bi_word_list[temp].word_s).compare(b_word_.str1+b_word_.str2)>0) {
		bi_word_list[temp].word = b_word_.str1;
		bi_word_list[temp].word_s = b_word_.str2;
	}															// if the new spelling is smaller than the stored one, update it

}


//"      hello fucking333 world hello fuck fuck abc fucking231 \n hello sd"



/*
Collect the 10 most frequent word pairs in the files
Returns a vector<word_word_time>
*/
vector<word_word_time> the_most_ten_bi() {
	vector<word_word_time> most_ten_bi(10);
	word_word_time temp;
	unordered_map<string, word_word_time>::iterator it = bi_word_list.begin();
	while (it != bi_word_list.end()) {
		/*most_ten_bi[10] = it->second;
		for (int ii = 10; ii >= 1; ii--) {
			if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) {
				temp = most_ten_bi[ii];
				most_ten_bi[ii] = most_ten_bi[ii - 1];
				most_ten_bi[ii - 1] = temp;
			}
		}*/
		if (it->second.time > most_ten_bi[9].time) {
			if (it->second.time > most_ten_bi[0].time)
				most_ten_bi.insert(most_ten_bi.begin(), it->second);
			else
				for (int ii = 1; ii <= 9; ii++) {
					if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) {
						most_ten_bi.insert(most_ten_bi.begin() + ii, it->second);
						break;
					}
				}
			//if(it->second.time > most_ten[0].time)
			//most_ten.insert(most_ten.begin(), it->second);
		}
		it++;
	}
	most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end());
	return most_ten_bi;
}

/*
Depth-first traversal of a folder and its subdirectories
*/
long sum1 = 0;
int line_sum = 0;
void DfsFolder(string lname)
{
DIR *dir_ptr;
    struct stat infobuf;
    struct dirent *direntp;
    string name, temp;
    ifstream infile;
    string  text;
    if ((dir_ptr = opendir(lname.c_str())) == NULL)
        perror("can not open");
    else
    {
        while ((direntp = readdir(dir_ptr)) != NULL)
        {
            temp = "";
            name = direntp->d_name;
            if (name == "." || name==".." )
            {
                ;
            }
            else
            {
                temp+=lname;
                temp+="/";
                temp+=name;

                //strcat(temp, lname);
                //strcat(temp, "/");
                //strcat(temp, name);
                if ((stat(temp.c_str(), &infobuf)) == -1)
                    printf("#########\n");
                if ((infobuf.st_mode & 0170000) == 0040000)
                {
                    //printf("%s",name);
                    //printf("  this is a directory\n");
                    DfsFolder(temp);
                }
                else
                {
                    //printf("%s",name);
                    //printf("  this is a file\n");
                    infile.open(temp, ios::in);
                    //line_sum++;
                    while (getline(infile, text)) {
                        //text.append(temp);
                        //cout << temp << endl;
                        sum1 += text.length();
                        line_sum++;
                        insert_into_wordlist(text);
                    }
                    if(text == "") line_sum++;   // if the last line read was empty, count it as a line
                    //insert_into_wordlist(text);
                    infile.close();
                }
            }
        }
    }
    closedir(dir_ptr);
}

int main(int argc, char * argv[]) {
    string path=argv[1];
	DfsFolder(path);
    ofstream outfile;
    outfile.open("result.out", ios::out);
    //outfile.flags(ios::left);
	outfile << "char_number :" << sum1 << endl;
    outfile << "line_number :" << line_sum << endl;
	outfile << "word_number :" << sum << endl;
    outfile <<  endl;
    //outfile.open("result.out", ios::out);
    vector<word_word_time> a=the_most_ten_bi();
    outfile << "the top ten frequency of phrase :" << endl;
    for (int ii = 0; ii < 10; ii++)
        outfile <<   a[ii].word << ' ' << a[ii].word_s <<setw(10) << a[ii].time << endl;
    vector<word_time> b = the_most_ten();
    outfile << endl;
    outfile << "the top ten frequency of word :" << endl;
    for (int ii = 0; ii < 10; ii++)
        outfile  <<  b[ii].word << setw(10)  << b[ii].time << endl;
    outfile.close();
}


/*
int main() {
	bi_word a, b, c, d, e, f;
	a.str1 = "hello"; a.str2 = "world";
	b.str1 = "hello1"; b.str2 = "world";
	c.str1 = "hello"; c.str2 = "world2";
	d.str1 = "hello"; d.str2 = "fuck33";
	e.str1 = "world"; e.str2 = "hello2";
	f.str1 = "fucking"; f.str2 = "world";
	add_a_bi_word(a);
	add_a_bi_word(b);
	add_a_bi_word(c);
	add_a_bi_word(d);
	add_a_bi_word(e);
	add_a_bi_word(f);
}
*/








/*
int main() {
	string h = "      hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd";
	//string h = "    abc";
	insert_into_wordlist(h);
	vector<word_time> ten_word=the_most_ten();
	vector<word_word_time>ten_bi_word = the_most_ten_bi();
	system("pause");
	return 0;
}
*/


/*
int main() {
	//define the struct; during the search it holds the attributes of the file that was found
	_finddata_t file;
	//search for all files; _findfirst returns -1 on failure and a handle on success
	int k;
	long HANDLE;
	k = HANDLE = _findfirst("*.*", &file);
	//using the handle, keep fetching the next file until no new file can be found
	while (k != -1) {
		//cout << file.name << endl;  per-file processing goes here
		k = _findnext(HANDLE, &file);
	}
	_findclose(HANDLE);
	return 0;
}*/

Profiling the code with gprof:
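For reference, a typical gprof workflow looks roughly like the following; the file names and the sample path are placeholders, not the ones used in the original run:

# build with profiling instrumentation
g++ -O2 -pg wordcount.cpp -o wordcount

# run the program normally; this writes gmon.out into the working directory
./wordcount /path/to/samples

# turn gmon.out into a flat profile and call graph
gprof ./wordcount gmon.out > profile.txt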


You can see that even after the optimizations, unordered_map's hash function still takes a large share of the CPU time and a lot of memory.


At the call-tree level, traversing and inserting into the unordered_map is what costs the most. This is a limitation of the STL container itself; some classmates tried writing their own hash table and found it performed no better than the STL one.


Summary:

This project essentially uses an unordered_map as the dictionary; admittedly, it would be even more convenient to implement in Python. Every word is normalized before being stored in the unordered_map, which makes the later frequency statistics very easy.



Reposted from blog.csdn.net/z1991998920/article/details/79689179