前言:
软工第一次作业是实现对文件夹中文件的词频统计,具体要求在博客中。
题目:软工第一次作业
需求:
1. 统计文件的字符数
2. 统计文件的单词总数
3. 统计文件的总行数
4. 统计文件中各单词的出现次数
5. 对给定文件夹及其递归子文件夹下的所有文件进行统计
6. 统计两个单词(词组)在一起的频率,输出频率最高的前10个
PSP:
PSP | Personal Software Process Stages | Time /h | |
---|---|---|---|
Planning | 计划 | 20 | |
Estimate | 估计这个任务需要多少时间 | 20 | |
Development | 开发 | 52 | |
Analysis | 需求分析 (包括学习新技术) | 2 | |
Design Spec | 生成设计文档 | 1 | |
Design | 具体设计 | 4 | |
Coding | 具体编码 | 13 | |
Code Review | 代码复审 | 2 | |
Test | 测试(自我测试,修改代码,提交修改) | 30 | |
Reporting | 报告 | 10 | |
Test Report | 测试报告 | 6 | |
Size Measurement | 计算工作量 | 2 | |
Postmortem & Process Improvement Plan | 事后总结, 并提出过程改进计划 | 2 | |
思路:
一开始准备写个字典树,字典树的搜索效率是O(n)。转念一想,为什么不用Hash呢,Hash的查找只需要O(1),于是准备一鼓作气写个hash。Google如何写一个perfect的hash的时候,某位大佬提到了unordered_map。(在这感谢这位不愿意透露姓名的赵r大佬)。
然后Google Unordered_map 看到最全面的一篇blog就是这个,博主讲的很详细。
又恶补了一通map的用法,详情戳这里。
好了开始动工~
class word_time { public: string word; //最后输出的按照字典顺序的字母 int time; //单词出现的次数 public: word_time(){ //构造函数,init this->word = ""; this->time = 0; } };
unordered_map<string, word_time> word_list; //从“一个单词”到“单词最简形式和出现次数构成的类”的一个字典
主要思想就是这样了,废话不多说,直接上代码~
代码:
#include <io.h> #include <iostream> #include <unordered_map> #include <string> #include <cctype> #include <algorithm> #include <fstream> using namespace std; class word_time { public: string word; int time; public: word_time(){ this->word = ""; this->time = 0; } }; class word_word_time : public word_time { public: string word_s; word_word_time() { this->time = 0; this->word = ""; this->word_s = ""; } void operator=(const word_word_time &another) { this->time = another.time; this->word = another.word; this->word_s = another.word_s; } }; class bi_word { public: string str1; string str2; bi_word() { this->str1 = ""; this->str2 = ""; } bi_word(const bi_word &another) { this->str1 = another.str1; this->str2 = another.str2; } bool operator==(const bi_word &another) { if (this->str1 == another.str1&&this->str2 == another.str2) return true; return false; } void operator=(const bi_word &another) { this->str1 = another.str1; this->str2 = another.str2; } }; unordered_map<string, word_time> word_list; unordered_map<string, word_word_time> bi_word_list; /* 判断一个char是不是字母 参数类型: char */ bool is_letter(char m) { if (m >= 65 && m <= 90 || m >= 97 && m <= 122) return true; return false; } /* 判断一个char是不是分隔符 */ bool is_fengefu(char m) { if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57) return false; return true; } /* 添加一个字母到word_list中,并统计词数 参数类型: string */ void add_a_word(string word) { if (!is_letter(word[0])) return; //如果word[0]不是字母就return string word_ = word; string::iterator it; word_time word__time; it = word.end(); it--; while (!is_letter(*it)) { it--; }; //*it不是字母 word.erase(it+1, word.end()); //截取前面一部分 /*for (it = word.begin(); it - word.begin() < 4; it++) { if (!is_letter(*it)) return; }*/ //如果it前四位不是纯字母,直接 transform(word.begin(), word.end(), word.begin(), ::toupper); //转换为大写 //word_time one = word_list[word]; word_list[word].time++; //把化简后的word塞入word_list并++次数 if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) { word_list[word].word = word_; } //如果word_比原来的小 就更新 } /* 统计一行字符数 参数类型:string */ int count_char_sum(string str) { return(str.length()); } /* 声明一下add_a_bi_word函数 */ void add_a_bi_word(bi_word b_word); /* 将一行的单词输入进word_list,并生成n-1个词组,并将这n-1个词组输入进bi_word_list(其实是个map) */ int sum=0; string str_temp,str_now; void insert_into_wordlist(string &line) { vector<vector<string>> wordlist_of_a_line_vec; vector<bi_word> bi_wordlist_of_a_line; bi_word temp; //string::iterator it=line.begin(),it1=line.begin(); int it_last=0,it1; bool flag=false; line.append(" "); for (; is_fengefu(line[it_last])&&(size_t)it_last<line.length(); it_last++); for (int it=it_last; line[it]!= '\0'&& (size_t)it<line.length(); it++) { if (is_fengefu(line[it])) { for (it1 = it_last; it1 - it_last < 4 &&(size_t)it1<line.length(); it1++) { if (!is_letter(line[it1])) { flag = true; break; } }//判断是否是单词 不是就丢掉 if (flag == false) {//如果是单词 /*if(wordlist_of_a_line_vec[0].size<40) wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last)); //插入进单词列表 else { wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last)); }*/ sum++; str_now = line.substr(it_last, it - it_last); add_a_word(str_now); if (str_temp != "") { temp.str1 = str_temp; temp.str2 = str_now; add_a_bi_word(temp); } str_temp = str_now; } /*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/ //wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last)); flag = false; it_last = it + 1; } } //将单词列表一个个的map到map表中 /*int ii = 0; for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) { add_a_word(*it1);//将单词一个一个的add进字典 if (it1 < wordlist_of_a_line_vec[0].end() - 1) { bi_wordlist_of_a_line.push_back(temp); bi_wordlist_of_a_line[ii].str1 = *it1; bi_wordlist_of_a_line[ii].str2 = *(it1+1); ii++; } //插入词组列表 } //将单词一个一个的add进字典 for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) { add_a_bi_word(*it2); }*/ } /* 将文件中出现频率最高的10个单词统计下来 返回一个vector<word_time> */ vector<word_time> the_most_ten() { vector<word_time> most_ten(10); unordered_map<string, word_time>::iterator it = word_list.begin(); while (it != word_list.end()) { if (it->second.time > most_ten[9].time) { if (it->second.time > most_ten[0].time) most_ten.insert(most_ten.begin(), it->second); else for (int ii = 1; ii<=9; ii++) { if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) { most_ten.insert(most_ten.begin() + ii, it->second); break; } } //if(it->second.time > most_ten[0].time) //most_ten.insert(most_ten.begin(), it->second); } it++; } most_ten.erase(most_ten.begin() + 10, most_ten.end()); return most_ten; } /* 统计文件中的词组数,存入bi_word_list */ void add_a_bi_word(bi_word b_word) { if (!is_letter(b_word.str1[0])|| !is_letter(b_word.str2[0])) return; //如果word[0]不是字母就return bi_word b_word_ = b_word; string::iterator it1,it2; word_word_time word_word__time; it1 = b_word.str1.end(); it2 = b_word.str2.end(); it1--; it2--; while (!is_letter(*it1)) { it1--; }; //*it不是字母 while (!is_letter(*it2)) { it2--; }; b_word.str1.erase(it1 + 1, b_word.str1.end()); //截取前面一部分 b_word.str2.erase(it2 + 1, b_word.str2.end()); /*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) { if (!is_letter(*it1)) return; } //如果it前四位不是纯字母,直接 for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) { if (!is_letter(*it2)) return; }*/ transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper); //转换为大写 transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper); string temp = b_word.str1 + b_word.str2; bi_word_list[temp].time++; //把化简后的word塞入word_list并++次数 if (bi_word_list[temp].word == "" || (bi_word_list[temp].word+ bi_word_list[temp].word_s).compare(b_word_.str1+b_word_.str2)>0) { bi_word_list[temp].word = b_word_.str1; bi_word_list[temp].word_s = b_word_.str2; } //如果word_比原来的小 就更新 } //" hello fucking333 world hello fuck fuck abc fucking231 \n hello sd" /* 将文件中出现频率最高的10个词组统计下来 返回一个vector<word_word_time> */ vector<word_word_time> the_most_ten_bi() { vector<word_word_time> most_ten_bi(10); word_word_time temp; unordered_map<string, word_word_time>::iterator it = bi_word_list.begin(); while (it != bi_word_list.end()) { /*most_ten_bi[10] = it->second; for (int ii = 10; ii >= 1; ii--) { if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) { temp = most_ten_bi[ii]; most_ten_bi[ii] = most_ten_bi[ii - 1]; most_ten_bi[ii - 1] = temp; } }*/ if (it->second.time > most_ten_bi[9].time) { if (it->second.time > most_ten_bi[0].time) most_ten_bi.insert(most_ten_bi.begin(), it->second); else for (int ii = 1; ii <= 9; ii++) { if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) { most_ten_bi.insert(most_ten_bi.begin() + ii, it->second); break; } } //if(it->second.time > most_ten[0].time) //most_ten.insert(most_ten.begin(), it->second); } it++; } most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end()); return most_ten_bi; } /* 深度优先搜索文件夹和子目录 */ long sum1 = 0; int line_sum = 0; void DfsFolder(string path, int layer) { _finddata_t file_info; string current_path = path + "/*.*"; //也可以用/*来匹配所有 int handle = _findfirst(current_path.c_str(), &file_info); //返回值为-1则查找失败 ifstream infile; string temp, text; if (-1 == handle) { cout << "cannot match the path" << endl; return; } do { //判断是否子目录 if (file_info.attrib == _A_SUBDIR) { //递归遍历子目录 //打印记号反映出深度层次 /*for (int i = 0; i<layer; i++) cout << "--"; cout << file_info.name << endl;*/ int layer_tmp = layer; if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0) //.是当前目录,..是上层目录,必须排除掉这两种情况 DfsFolder(path + '/' + file_info.name, layer_tmp + 1); //再windows下可以用\\转义分隔符,不推荐 } else { //打印记号反映出深度层次 /*for (int i = 0; i<layer; i++) cout << "--"; cout << file_info.name << endl;*/ infile.open(path + '/' + file_info.name, ios::in); while (getline(infile, temp)) { //text.append(temp); //cout << temp << endl; sum1 += temp.length(); line_sum++; insert_into_wordlist(temp); } //insert_into_wordlist(text); infile.close(); } } while (!_findnext(handle, &file_info)); //返回0则遍历完 //关闭文件句柄 _findclose(handle); } int main() { DfsFolder("E:/tales", 0); word_time test = word_list["THAT"]; //前十词组 vector<word_word_time> a=the_most_ten_bi(); //前十单词 vector<word_time> b = the_most_ten(); //字符总数 sum1; //单词总数 sum; //总行数 line_sum; //vector<word_word_time } /* int main() { bi_word a, b, c, d, e, f; a.str1 = "hello"; a.str2 = "world"; b.str1 = "hello1"; b.str2 = "world"; c.str1 = "hello"; c.str2 = "world2"; d.str1 = "hello"; d.str2 = "fuck33"; e.str1 = "world"; e.str2 = "hello2"; f.str1 = "fucking"; f.str2 = "world"; add_a_bi_word(a); add_a_bi_word(b); add_a_bi_word(c); add_a_bi_word(d); add_a_bi_word(e); add_a_bi_word(f); } */ /* int main() { string h = " hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd"; //string h = " abc"; insert_into_wordlist(h); vector<word_time> ten_word=the_most_ten(); vector<word_word_time>ten_bi_word = the_most_ten_bi(); system("pause"); return 0; } */ /* int main() { //定义结构体,在查找时,该结构体中存储了查找到文件相应的属性 _finddata_t file; //查找所有文件,如果查找失败,则返回-1;查找成功,返回相应的句柄 int k; long HANDLE; k = HANDLE = _findfirst("*.*", &file); //根据相应的句柄,可以依次查找下一个文件;直到无法查询到新的文件为止 while (k != -1) { //cout << file.name << endl; 操作函数放在这 k = _findnext(HANDLE, &file); } _findclose(HANDLE); return 0; }*/ /* int main() { string word = "StRt123546"; if (!(word[0] >= 65 && word[0] <= 90 || word[0] >= 97 && word[0] <= 122)) return 0; string word_ = word; string::iterator it=word.end(); it--; while (!(*it >= 65 && *it <= 90 || *it >= 97 && *it <= 122)) { it--; };//*it不是字母 word.erase(it+1, word.end()); for (it = word.begin(); it - word.begin() < 4; it++) { if (!is_letter(*it)) return 0; } transform(word.begin(), word.end(), word.begin(), ::toupper); word_list[word].time++; //把化简后的word塞入word_list并++次数 if (word_list[word].word == "" || word_list[word].word.compare(word_)) { word_list[word].word = word_; } //如果word_比原来的小 就更新 cout << word << endl; system("pause"); return 0; }*/
测试样例在main()的注释中,这是对每个单元的测试;
总测试集
1、空集
2、只有一个文本文件,且没有内容
3、只有一个文本文件,只有一个空格
4、只有一个文本文件,有一个空格和一个换行符
5、只有一个文本文件(极简情况)
6、有两个文件,一个.txt(文本文件)和一个.pdf(二进制文件)
7、多文件多文件夹
优化:
1、消除重复计算,在insert函数中减少每次的运算次数,代码运行时间减小5%左右;
2、改进读取模式,使用getline,虽然fread更快,但是对于某些样本不稳定,舍弃。代码稍微变快。
3、改进传参效率,使用引用string变量,使得代码运行时间平均减小30%;
4、删除word_list,采用即读即插的形式,使得代码运行时间平均减小10%;
附上最终代码:
#include <io.h> #include <iostream> #include <unordered_map> #include <string> #include <cctype> #include <algorithm> #include <fstream> #include <time.h> //#include <iomanip> using namespace std; class word_time { public: string word; int time; public: word_time() { this->word = ""; this->time = 0; } }; class word_word_time : public word_time { public: string word_s; word_word_time() { this->time = 0; this->word = ""; this->word_s = ""; } void operator=(const word_word_time &another) { this->time = another.time; this->word = another.word; this->word_s = another.word_s; } }; class bi_word { public: string str1; string str2; bi_word() { this->str1 = ""; this->str2 = ""; } bi_word(const bi_word &another) { this->str1 = another.str1; this->str2 = another.str2; } bool operator==(const bi_word &another) { if (this->str1 == another.str1&&this->str2 == another.str2) return true; return false; } void operator=(const bi_word &another) { this->str1 = another.str1; this->str2 = another.str2; } }; unordered_map<string, word_time> word_list; unordered_map<string, word_word_time> bi_word_list; /* 判断一个char是不是字母 参数类型: char */ bool is_letter(char m) { if (m >= 65 && m <= 90 || m >= 97 && m <= 122) return true; return false; } /* 判断一个char是不是分隔符 */ bool is_fengefu(char m) { if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57) return false; return true; } /* 添加一个字母到word_list中,并统计词数 参数类型: string */ void add_a_word(string word) { if (!is_letter(word[0])) return; //如果word[0]不是字母就return string word_ = word; string::iterator it; word_time word__time; it = word.end(); it--; while (!is_letter(*it)) { it--; }; //*it不是字母 word.erase(it + 1, word.end()); //截取前面一部分 /*for (it = word.begin(); it - word.begin() < 4; it++) { if (!is_letter(*it)) return; }*/ //如果it前四位不是纯字母,直接 transform(word.begin(), word.end(), word.begin(), ::toupper); //转换为大写 //word_time one = word_list[word]; word_list[word].time++; //把化简后的word塞入word_list并++次数 if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) { word_list[word].word = word_; } //如果word_比原来的小 就更新 } /* 统计一行字符数 参数类型:string */ int count_char_sum(string str) { return(str.length()); } /* 声明一下add_a_bi_word函数 */ void add_a_bi_word(bi_word b_word); /* 将一行的单词输入进word_list,并生成n-1个词组,并将这n-1个词组输入进bi_word_list(其实是个map) */ int sum = 0; string str_temp, str_now; void insert_into_wordlist(string &line) { vector<vector<string>> wordlist_of_a_line_vec; vector<bi_word> bi_wordlist_of_a_line; bi_word temp; //string::iterator it=line.begin(),it1=line.begin(); int it_last = 0, it1; bool flag = false; line.append(" "); for (; is_fengefu(line[it_last]) && (size_t)it_last<line.length(); it_last++); for (int it = it_last; line[it] != '\0' && (size_t)it<line.length(); it++) { if (is_fengefu(line[it])) { for (it1 = it_last; it1 - it_last < 4 && (size_t)it1<line.length(); it1++) { if (!is_letter(line[it1])) { flag = true; break; } }//判断是否是单词 不是就丢掉 if (flag == false) {//如果是单词 /*if(wordlist_of_a_line_vec[0].size<40) wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last)); //插入进单词列表 else { wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last)); }*/ sum++; str_now = line.substr(it_last, it - it_last); add_a_word(str_now); if (str_temp != "") { temp.str1 = str_temp; temp.str2 = str_now; add_a_bi_word(temp); } str_temp = str_now; } /*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/ //wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last)); flag = false; it_last = it + 1; } } //将单词列表一个个的map到map表中 /*int ii = 0; for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) { add_a_word(*it1);//将单词一个一个的add进字典 if (it1 < wordlist_of_a_line_vec[0].end() - 1) { bi_wordlist_of_a_line.push_back(temp); bi_wordlist_of_a_line[ii].str1 = *it1; bi_wordlist_of_a_line[ii].str2 = *(it1+1); ii++; } //插入词组列表 } //将单词一个一个的add进字典 for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) { add_a_bi_word(*it2); }*/ } /* 将文件中出现频率最高的10个单词统计下来 返回一个vector<word_time> */ vector<word_time> the_most_ten() { vector<word_time> most_ten(10); unordered_map<string, word_time>::iterator it = word_list.begin(); while (it != word_list.end()) { if (it->second.time > most_ten[9].time) { if (it->second.time > most_ten[0].time) most_ten.insert(most_ten.begin(), it->second); else for (int ii = 1; ii <= 9; ii++) { if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) { most_ten.insert(most_ten.begin() + ii, it->second); break; } } //if(it->second.time > most_ten[0].time) //most_ten.insert(most_ten.begin(), it->second); } it++; } most_ten.erase(most_ten.begin() + 10, most_ten.end()); return most_ten; } /* 统计文件中的词组数,存入bi_word_list */ void add_a_bi_word(bi_word b_word) { if (!is_letter(b_word.str1[0]) || !is_letter(b_word.str2[0])) return; //如果word[0]不是字母就return bi_word b_word_ = b_word; string::iterator it1, it2; word_word_time word_word__time; it1 = b_word.str1.end(); it2 = b_word.str2.end(); it1--; it2--; while (!is_letter(*it1)) { it1--; }; //*it不是字母 while (!is_letter(*it2)) { it2--; }; b_word.str1.erase(it1 + 1, b_word.str1.end()); //截取前面一部分 b_word.str2.erase(it2 + 1, b_word.str2.end()); /*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) { if (!is_letter(*it1)) return; } //如果it前四位不是纯字母,直接 for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) { if (!is_letter(*it2)) return; }*/ //transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper); //转换为大写 //transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper); for (string::iterator itfirst = b_word.str1.begin(); itfirst < b_word.str1.end(); itfirst++) { if (*itfirst >= 'a') *itfirst -= 32; } for (string::iterator itsecond = b_word.str2.begin(); itsecond < b_word.str2.end(); itsecond++) { if (*itsecond >= 'a') *itsecond -= 32; } string temp = b_word.str1 + b_word.str2; bi_word_list[temp].time++; //把化简后的word塞入word_list并++次数 if (bi_word_list[temp].word == "" || (bi_word_list[temp].word + bi_word_list[temp].word_s).compare(b_word_.str1 + b_word_.str2)>0) { bi_word_list[temp].word = b_word_.str1; bi_word_list[temp].word_s = b_word_.str2; } //如果word_比原来的小 就更新 } //" hello fucking333 world hello fuck fuck abc fucking231 \n hello sd" /* 将文件中出现频率最高的10个词组统计下来 返回一个vector<word_word_time> */ vector<word_word_time> the_most_ten_bi() { vector<word_word_time> most_ten_bi(10); word_word_time temp; unordered_map<string, word_word_time>::iterator it = bi_word_list.begin(); while (it != bi_word_list.end()) { /*most_ten_bi[10] = it->second; for (int ii = 10; ii >= 1; ii--) { if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) { temp = most_ten_bi[ii]; most_ten_bi[ii] = most_ten_bi[ii - 1]; most_ten_bi[ii - 1] = temp; } }*/ if (it->second.time > most_ten_bi[9].time) { if (it->second.time > most_ten_bi[0].time) most_ten_bi.insert(most_ten_bi.begin(), it->second); else for (int ii = 1; ii <= 9; ii++) { if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) { most_ten_bi.insert(most_ten_bi.begin() + ii, it->second); break; } } //if(it->second.time > most_ten[0].time) //most_ten.insert(most_ten.begin(), it->second); } it++; } most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end()); return most_ten_bi; } /* 深度优先搜索文件夹和子目录 */ long sum1 = 0; int line_sum = 0; void DfsFolder(string path, int layer) { _finddata_t file_info; string current_path = path + "/*.*"; //也可以用/*来匹配所有 int handle = _findfirst(current_path.c_str(), &file_info); //返回值为-1则查找失败 ifstream infile; string temp, text; if (-1 == handle) { cout << "cannot match the path" << endl; return; } do { //判断是否子目录 if (file_info.attrib == _A_SUBDIR) { //递归遍历子目录 //打印记号反映出深度层次 /*for (int i = 0; i<layer; i++) cout << "--"; cout << file_info.name << endl;*/ int layer_tmp = layer; if (strcmp(file_info.name, "..") != 0 && strcmp(file_info.name, ".") != 0) //.是当前目录,..是上层目录,必须排除掉这两种情况 DfsFolder(path + '/' + file_info.name, layer_tmp + 1); //再windows下可以用\\转义分隔符,不推荐 } else { //打印记号反映出深度层次 /*for (int i = 0; i<layer; i++) cout << "--"; cout << file_info.name << endl;*/ infile.open(path + '/' + file_info.name, ios::in); //line_sum++; /*infile.seekg(0, ios::end); if (infile.get() == '\n') line_sum++; infile.seekg(0, ios::beg);*/ while (getline(infile, temp)) { //text.append(temp); //cout << temp << endl; sum1 += temp.length(); //if (temp.length()!=0) line_sum++; insert_into_wordlist(temp); } if (temp == "")line_sum++; //insert_into_wordlist(text); infile.close(); } } while (!_findnext(handle, &file_info)); //返回0则遍历完 //关闭文件句柄 _findclose(handle); } int main(int argc, char * argv[]) { //clock_t startTime, endTime; //startTime = clock(); string path = argv[1]; DfsFolder(path, 0); //DfsFolder("E:/Samples", 0); ofstream outfile; outfile.open("result.out", ios::out); //outfile.flags(ios::left); outfile << "char_number :" << sum1 << endl; outfile << "line_number :" << line_sum << endl; outfile << "word_number :" << sum << endl; outfile << endl; //outfile.open("result.out", ios::out); vector<word_word_time> a = the_most_ten_bi(); outfile << "the top ten frequency of phrase :" << endl; for (int ii = 0; ii < 10; ii++) outfile << a[ii].word << ' ' << a[ii].word_s <<' '<< a[ii].time << endl; vector<word_time> b = the_most_ten(); outfile << endl; outfile << "the top ten frequency of word :" << endl; for (int ii = 0; ii < 10; ii++) outfile << b[ii].word << b[ii].time << endl; outfile.close(); //endTime = clock(); //cout << "Totle Time : " << (double)(endTime - startTime) / CLOCKS_PER_SEC << "s" << endl; return 0; } /* int main() { //time_t start = clock(); DfsFolder("E:/Samples", 0); //word_time test = word_list["THAT"]; //前十词组 vector<word_word_time> a=the_most_ten_bi(); cout << "bi_word_most" << endl; for (int ii = 0; ii < 10; ii++) cout << a[ii].word << ' ' << a[ii].word_s << ' ' << a[ii].time << endl; //前十单词 vector<word_time> b = the_most_ten(); cout << "word_most" << endl; for (int ii = 0; ii < 10; ii++) cout << b[ii].word << ' ' << b[ii].time << endl; //字符总数 cout << "char_sum" << endl; cout << sum1 << endl; //单词总数 cout << "word_sum" << endl; cout << sum << endl; //总行数 cout << "line_sum" << endl; cout << line_sum << endl; //time_t end = clock(); //cout << "xunxingshijian" << double(start - end) << endl; system("pause"); //vector<word_word_time }*/ /* int main() { bi_word a, b, c, d, e, f; a.str1 = "hello"; a.str2 = "world"; b.str1 = "hello1"; b.str2 = "world"; c.str1 = "hello"; c.str2 = "world2"; d.str1 = "hello"; d.str2 = "fuck33"; e.str1 = "world"; e.str2 = "hello2"; f.str1 = "fucking"; f.str2 = "world"; add_a_bi_word(a); add_a_bi_word(b); add_a_bi_word(c); add_a_bi_word(d); add_a_bi_word(e); add_a_bi_word(f); } */ /* int main() { string h = " hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd"; //string h = " abc"; insert_into_wordlist(h); vector<word_time> ten_word=the_most_ten(); vector<word_word_time>ten_bi_word = the_most_ten_bi(); system("pause"); return 0; } */ /* int main() { //定义结构体,在查找时,该结构体中存储了查找到文件相应的属性 _finddata_t file; //查找所有文件,如果查找失败,则返回-1;查找成功,返回相应的句柄 int k; long HANDLE; k = HANDLE = _findfirst("*.*", &file); //根据相应的句柄,可以依次查找下一个文件;直到无法查询到新的文件为止 while (k != -1) { //cout << file.name << endl; 操作函数放在这 k = _findnext(HANDLE, &file); } _findclose(HANDLE); return 0; }*/ /* int main() { string word = "StRt123546"; if (!(word[0] >= 65 && word[0] <= 90 || word[0] >= 97 && word[0] <= 122)) return 0; string word_ = word; string::iterator it=word.end(); it--; while (!(*it >= 65 && *it <= 90 || *it >= 97 && *it <= 122)) { it--; };//*it不是字母 word.erase(it+1, word.end()); for (it = word.begin(); it - word.begin() < 4; it++) { if (!is_letter(*it)) return 0; } transform(word.begin(), word.end(), word.begin(), ::toupper); word_list[word].time++; //把化简后的word塞入word_list并++次数 if (word_list[word].word == "" || word_list[word].word.compare(word_)) { word_list[word].word = word_; } //如果word_比原来的小 就更新 cout << word << endl; system("pause"); return 0; }*/
现在的调用树热行:
发现还是insert函数过于吃内存,然后看到最底端
发现还是hash_map过于吃CPU了,但是这是数据结构本身的问题了,在调用数据结构的时候这个耗时已经决定了。可以说是优化的终点。
Linux下代码:
#include <dirent.h> #include <sys/stat.h> #include <iostream> #include <unordered_map> #include <string> #include <cctype> #include <algorithm> #include <fstream> #include<time.h> #include <iomanip> using namespace std; class word_time { public: string word; int time; public: word_time(){ this->word = ""; this->time = 0; } }; class word_word_time : public word_time { public: string word_s; word_word_time() { this->time = 0; this->word = ""; this->word_s = ""; } void operator=(const word_word_time &another) { this->time = another.time; this->word = another.word; this->word_s = another.word_s; } }; class bi_word { public: string str1; string str2; bi_word() { this->str1 = ""; this->str2 = ""; } bi_word(const bi_word &another) { this->str1 = another.str1; this->str2 = another.str2; } bool operator==(const bi_word &another) { if (this->str1 == another.str1&&this->str2 == another.str2) return true; return false; } void operator=(const bi_word &another) { this->str1 = another.str1; this->str2 = another.str2; } }; unordered_map<string, word_time> word_list; unordered_map<string, word_word_time> bi_word_list; /* �ж�һ��char�Dz�����ĸ ��������: char */ bool is_letter(char m) { if (m >= 65 && m <= 90 || m >= 97 && m <= 122) return true; return false; } /* �ж�һ��char�Dz��Ƿָ�� */ bool is_fengefu(char m) { if (m >= 65 && m <= 90 || m >= 97 && m <= 122 || m >= 48 && m <= 57) return false; return true; } /* ���һ����ĸ��word_list�У���ͳ�ƴ��� �������ͣ� string */ void add_a_word(string word) { if (!is_letter(word[0])) return; //���word[0]������ĸ��return string word_ = word; string::iterator it; word_time word__time; it = word.end(); it--; while (!is_letter(*it)) { it--; }; //*it������ĸ word.erase(it+1, word.end()); //��ȡǰ��һ���� /*for (it = word.begin(); it - word.begin() < 4; it++) { if (!is_letter(*it)) return; }*/ //���itǰ��λ���Ǵ���ĸ��ֱ�� transform(word.begin(), word.end(), word.begin(), ::toupper); //ת��Ϊ��д //word_time one = word_list[word]; word_list[word].time++; //�ѻ�����word����word_list��++���� if (word_list[word].word == "" || word_list[word].word.compare(word_)>0) { word_list[word].word = word_; } //���word_��ԭ����С ���� } /* ͳ��һ���ַ��� ��������:string */ int count_char_sum(string str) { return(str.length()); } /* ����һ��add_a_bi_word���� */ void add_a_bi_word(bi_word b_word); /* ��һ�еĵ��������word_list,������n-1������,������n-1�����������bi_word_list����ʵ�Ǹ�map�� */ int sum=0; string str_temp,str_now; void insert_into_wordlist(string &line) { vector<vector<string>> wordlist_of_a_line_vec; vector<bi_word> bi_wordlist_of_a_line; bi_word temp; //string::iterator it=line.begin(),it1=line.begin(); int it_last=0,it1; bool flag=false; line.append(" "); for (; is_fengefu(line[it_last])&&(size_t)it_last<line.length(); it_last++); for (int it=it_last; line[it]!= '\0'&& (size_t)it<line.length(); it++) { if (is_fengefu(line[it])) { for (it1 = it_last; it1 - it_last < 4 &&(size_t)it1<line.length(); it1++) { if (!is_letter(line[it1])) { flag = true; break; } }//�ж��Ƿ��ǵ��� ���ǾͶ��� if (flag == false) {//����ǵ��� /*if(wordlist_of_a_line_vec[0].size<40) wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last)); //����������б� else { wordlist_of_a_line_vec[1].push_back(line.substr(it_last, it - it_last)); }*/ sum++; str_now = line.substr(it_last, it - it_last); add_a_word(str_now); if (str_temp != "") { temp.str1 = str_temp; temp.str2 = str_now; add_a_bi_word(temp); } str_temp = str_now; } /*for(int ii=0;wordlist_of_a_line_vec[ii].size()==40;ii++)*/ //wordlist_of_a_line_vec[0].push_back(line.substr(it_last, it - it_last)); flag = false; it_last = it + 1; } } //�������б�һ������map��map���� /*int ii = 0; for (vector<string>::iterator it1 = wordlist_of_a_line_vec[0].begin(); it1 < wordlist_of_a_line_vec[0].end(); it1++) { add_a_word(*it1);//������һ��һ����add���ֵ� if (it1 < wordlist_of_a_line_vec[0].end() - 1) { bi_wordlist_of_a_line.push_back(temp); bi_wordlist_of_a_line[ii].str1 = *it1; bi_wordlist_of_a_line[ii].str2 = *(it1+1); ii++; } //��������б� } //������һ��һ����add���ֵ� for (vector<bi_word>::iterator it2 = bi_wordlist_of_a_line.begin(); it2 < bi_wordlist_of_a_line.end(); it2++) { add_a_bi_word(*it2); }*/ } /* ���ļ��г���Ƶ����ߵ�10������ͳ������ ����һ��vector<word_time> */ vector<word_time> the_most_ten() { vector<word_time> most_ten(10); unordered_map<string, word_time>::iterator it = word_list.begin(); while (it != word_list.end()) { if (it->second.time > most_ten[9].time) { if (it->second.time > most_ten[0].time) most_ten.insert(most_ten.begin(), it->second); else for (int ii = 1; ii<=9; ii++) { if (it->second.time > most_ten[ii].time && it->second.time <= most_ten[ii - 1].time) { most_ten.insert(most_ten.begin() + ii, it->second); break; } } //if(it->second.time > most_ten[0].time) //most_ten.insert(most_ten.begin(), it->second); } it++; } most_ten.erase(most_ten.begin() + 10, most_ten.end()); return most_ten; } /* ͳ���ļ��еĴ�����,����bi_word_list */ void add_a_bi_word(bi_word b_word) { if (!is_letter(b_word.str1[0])|| !is_letter(b_word.str2[0])) return; //���word[0]������ĸ��return bi_word b_word_ = b_word; string::iterator it1,it2; word_word_time word_word__time; it1 = b_word.str1.end(); it2 = b_word.str2.end(); it1--; it2--; while (!is_letter(*it1)) { it1--; }; //*it������ĸ while (!is_letter(*it2)) { it2--; }; b_word.str1.erase(it1 + 1, b_word.str1.end()); //��ȡǰ��һ���� b_word.str2.erase(it2 + 1, b_word.str2.end()); /*for (it1 = b_word.str1.begin(); it1 - b_word.str1.begin() < 4; it1++) { if (!is_letter(*it1)) return; } //���itǰ��λ���Ǵ���ĸ��ֱ�� for (it2 = b_word.str2.begin(); it2 - b_word.str2.begin() < 4; it2++) { if (!is_letter(*it2)) return; }*/ //transform(b_word.str1.begin(), b_word.str1.end(), b_word.str1.begin(), ::toupper); //ת��Ϊ��д //transform(b_word.str2.begin(), b_word.str2.end(), b_word.str2.begin(), ::toupper); for (string::iterator itfirst = b_word.str1.begin(); itfirst < b_word.str1.end(); itfirst++) { if (*itfirst >= 'a') *itfirst -= 32; } for (string::iterator itsecond = b_word.str2.begin(); itsecond < b_word.str2.end(); itsecond++) { if (*itsecond >= 'a') *itsecond -= 32; } string temp = b_word.str1 + b_word.str2; bi_word_list[temp].time++; //�ѻ�����word����word_list��++���� if (bi_word_list[temp].word == "" || (bi_word_list[temp].word+ bi_word_list[temp].word_s).compare(b_word_.str1+b_word_.str2)>0) { bi_word_list[temp].word = b_word_.str1; bi_word_list[temp].word_s = b_word_.str2; } //���word_��ԭ����С ���� } //" hello fucking333 world hello fuck fuck abc fucking231 \n hello sd" /* ���ļ��г���Ƶ����ߵ�10������ͳ������ ����һ��vector<word_word_time> */ vector<word_word_time> the_most_ten_bi() { vector<word_word_time> most_ten_bi(10); word_word_time temp; unordered_map<string, word_word_time>::iterator it = bi_word_list.begin(); while (it != bi_word_list.end()) { /*most_ten_bi[10] = it->second; for (int ii = 10; ii >= 1; ii--) { if (most_ten_bi[ii].time > most_ten_bi[ii - 1].time) { temp = most_ten_bi[ii]; most_ten_bi[ii] = most_ten_bi[ii - 1]; most_ten_bi[ii - 1] = temp; } }*/ if (it->second.time > most_ten_bi[9].time) { if (it->second.time > most_ten_bi[0].time) most_ten_bi.insert(most_ten_bi.begin(), it->second); else for (int ii = 1; ii <= 9; ii++) { if (it->second.time > most_ten_bi[ii].time && it->second.time <= most_ten_bi[ii - 1].time) { most_ten_bi.insert(most_ten_bi.begin() + ii, it->second); break; } } //if(it->second.time > most_ten[0].time) //most_ten.insert(most_ten.begin(), it->second); } it++; } most_ten_bi.erase(most_ten_bi.begin() + 10, most_ten_bi.end()); return most_ten_bi; } /* ������������ļ��к���Ŀ¼ */ long sum1 = 0; int line_sum = 0; void DfsFolder(string lname) { DIR *dir_ptr; struct stat infobuf; struct dirent *direntp; string name, temp; ifstream infile; string text; if ((dir_ptr = opendir(lname.c_str())) == NULL) perror("can not open"); else { while ((direntp = readdir(dir_ptr)) != NULL) { temp = ""; name = direntp->d_name; if (name == "." || name==".." ) { ; } else { temp+=lname; temp+="/"; temp+=name; //strcat(temp, lname); //strcat(temp, "/"); //strcat(temp, name); if ((stat(temp.c_str(), &infobuf)) == -1) printf("#########\n"); if ((infobuf.st_mode & 0170000) == 0040000) { //printf("%s",name); //printf(" this is a directory\n"); DfsFolder(temp); } else { //printf("%s",name); //printf(" this is a file\n"); infile.open(temp, ios::in); //line_sum++; while (getline(infile, text)) { //text.append(temp); //cout << temp << endl; sum1 += text.length(); line_sum++; insert_into_wordlist(text); } if(temp == "") line_sum++; //insert_into_wordlist(text); infile.close(); } } } } closedir(dir_ptr); } int main(int argc, char * argv[]) { string path=argv[1]; DfsFolder(path); ofstream outfile; outfile.open("result.out", ios::out); //outfile.flags(ios::left); outfile << "char_number :" << sum1 << endl; outfile << "line_number :" << line_sum << endl; outfile << "word_number :" << sum << endl; outfile << endl; //outfile.open("result.out", ios::out); vector<word_word_time> a=the_most_ten_bi(); outfile << "the top ten frequency of phrase :" << endl; for (int ii = 0; ii < 10; ii++) outfile << a[ii].word << ' ' << a[ii].word_s <<setw(10) << a[ii].time << endl; vector<word_time> b = the_most_ten(); outfile << endl; outfile << "the top ten frequency of word :" << endl; for (int ii = 0; ii < 10; ii++) outfile << b[ii].word << setw(10) << b[ii].time << endl; outfile.close(); } /* int main() { bi_word a, b, c, d, e, f; a.str1 = "hello"; a.str2 = "world"; b.str1 = "hello1"; b.str2 = "world"; c.str1 = "hello"; c.str2 = "world2"; d.str1 = "hello"; d.str2 = "fuck33"; e.str1 = "world"; e.str2 = "hello2"; f.str1 = "fucking"; f.str2 = "world"; add_a_bi_word(a); add_a_bi_word(b); add_a_bi_word(c); add_a_bi_word(d); add_a_bi_word(e); add_a_bi_word(f); } */ /* int main() { string h = " hello fucking333 world hello fuck fuck fuck abc fucking231 \n hello sd"; //string h = " abc"; insert_into_wordlist(h); vector<word_time> ten_word=the_most_ten(); vector<word_word_time>ten_bi_word = the_most_ten_bi(); system("pause"); return 0; } */ /* int main() { //����ṹ�壬�ڲ���ʱ���ýṹ���д洢�˲��ҵ��ļ���Ӧ������ _finddata_t file; //���������ļ����������ʧ�ܣ��
用grof进行代码分析:
可以看到跑了之后还是 在unordered_map的Hash函数占用了很多CPU 占用很多内存
在调用树的层面上,可以看到遍历、插入unmap的过程非常耗时,但是由于stl的局限性,也有同学试着自己写hash,发现效果还没有stl好。
总结:
这个项目主要是通过unmap模拟字典,当然其实这个项目用python实现效果会更好。所有的单词化简后存入unmap,这样豁免的统计会很方便。