A simple site search engine based on Baidu's brpc framework

A while ago I built a website with a simple site search feature, but it was far too slow: every search pulled the data out of the database again. So I spent some time improving the search, implementing it with an inverted index and a forward index.

Note: every library used in this project has to be installed by yourself (cppjieba, jsoncpp, brpc, gflags, protobuf, leveldb); the last three are required by brpc.

The approach:

1. First, implement an offline program that builds the indexes

  • Start by pulling the index source from the database; every index entry is built from this data. Since my project is a news system, I pull each news item's id, title, author, content, and date, and all the keywords come from title, author, and content.
  • Use the cppjieba segmentation library to split the keyword sources into words. Every segmented word maps back to the id of the news item it came from, so a hash table stores this (key: the segmented keyword, value: the id of the news item the keyword appears in).
  • Because the same keyword can appear in several news items, the value of that hash table is an unordered_set: the set of news ids the keyword appears in.
  • The hash table built this way is the inverted index (keyword → news ids).
  • The forward index is simpler and is also a hash table (key: news id, value: the item's information, i.e. title, author, content, and date). Since its value is itself a set of key-value pairs (title: xxx), the table's type is unordered_map<string, unordered_map<string,string> >.
  • Finally, write both indexes out as JSON files with jsoncpp (.json files). Let's take a look at how the file contents are laid out.

Forward index file:
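Roughly, the forward index file ends up looking like this (the id and field values here are made-up examples; the real content is whatever the Save_index code below writes out):

{
   "1" : {
      "author" : "admin",
      "content" : "The body text of the news item ...",
      "date" : "2018-07-26",
      "title" : "An example news title"
   }
}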

Inverted index file:
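And the inverted index file looks roughly like this (the keywords and ids are made up). Note that each value is a space-separated list of ids with a trailing space, which is why the loading code later has to skip empty tokens after splitting:

{
   "admin" : "2 ",
   "新闻" : "1 3 "
}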

Below is the index-building code.

word_segmentation.hpp (the segmentation header)

#pragma once
#include "/home/pigff/third-part-lib/cppjieba/include/cppjieba/Jieba.hpp"
#include <iostream>
#include <string>
#include <vector>
using std::cout;
using std::endl;
using std::string;
using std::vector;

// Dictionary used by max-probability segmentation (MPSegment: Max Probability)
const char * const DICT_PATH = "/home/pigff/third-part-lib/cppjieba/dict/jieba.dict.utf8";
// Dictionary used by HMM segmentation (HMMSegment: Hidden Markov Model)
const char * const HMM_PATH = "/home/pigff/third-part-lib/cppjieba/dict/hmm_model.utf8";
// User-defined dictionary
const char * const USER_DICT_PATH = "/home/pigff/third-part-lib/cppjieba/dict/user.dict.utf8";
// IDF file
const char* const IDF_PATH = "/home/pigff/third-part-lib/cppjieba/dict/idf.utf8";
// Stop-word file
const char* const STOP_WORD_PATH = "/home/pigff/third-part-lib/cppjieba/dict/stop_words.utf8";

class WordSegmentation // word segmentation via the cppjieba (jieba) library
{
public: 
    WordSegmentation() 
        :_jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH)
    {} 

    vector<string> operator()(const string& str){
        // return the segmentation result of str
        vector<string> words;
        _jieba.CutAll(str, words); // full-mode segmentation (FullSegment)
        return words; 
    }
private:
    cppjieba::Jieba _jieba;
};
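For reference, a minimal usage sketch of this functor (the input string below is just a made-up example); CutAll runs cppjieba's full-mode segmentation, so it emits every candidate word it can find in the input:

WordSegmentation wordSeg;
// "百度开源框架" is a made-up example sentence
vector<string> words = wordSeg("百度开源框架");
for(const auto& w : words)
    cout << w << endl;    // print each segmented word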

build_index.cc (the index-building program)

#include "word_segmentation.hpp"
#include <mysql/mysql.h>
#include <unordered_map>
#include <unordered_set>
#include <unistd.h>
#include <fcntl.h>
#include <jsoncpp/json/value.h>
#include <jsoncpp/json/json.h>
#include <fstream>

struct All_News_Info{
    vector<string> ids;
    vector<string> titles;
    vector<string> authors;
    vector<string> contents;
    vector<string> dates;
};
// Fetch the data from the database
// (the keyword sources are each news item's title, author, and content)
void SearchData(All_News_Info& infos){
    MYSQL* conn = mysql_init(NULL);
    if(conn == NULL)
        cout << "Error: mysql_init failed" << endl;
    if(mysql_real_connect(conn,"localhost","root","1","news",0,NULL,0) == NULL)
        cout << "Error " << mysql_errno(conn) << ": " << mysql_error(conn);
    // Set the connection charset to utf8, otherwise Chinese text comes back garbled
    mysql_set_character_set(conn,"utf8");

    MYSQL_RES* result;
    MYSQL_ROW row;
    
    mysql_query(conn,"select id,title,author,content,createdate from news");
    result = mysql_store_result(conn);
    while((row = mysql_fetch_row(result))){
        infos.ids.push_back(row[0]);
        infos.titles.push_back(row[1]);
        infos.authors.push_back(row[2]);
        infos.contents.push_back(row[3]);
        infos.dates.push_back(row[4]);
    }
    // free the result set and close the connection
    mysql_free_result(result);
    mysql_close(conn);
}

// Build and save the indexes (forward index and inverted index)
// Forward index: document id -> full document info, used to look at the keyword occurrences in each document
// (results are ranked by how often the keywords hit each document: more hits means higher relevance)
// Inverted index: keyword -> document ids, used to find which documents a keyword appears in
void Save_index(const All_News_Info& infos){
    WordSegmentation wordSeg; 
    vector<string> results; 
    std::unordered_map<string,std::unordered_set<string>> inverted_index;
    std::unordered_map<string,std::unordered_map<string,string>> forward_index;
    for(size_t i = 0;i < infos.titles.size();++i){
        // put the segmentation results of the title, author, and content into the inverted index
        results = wordSeg(infos.titles[i]);
        for(auto it:results)
            inverted_index[it].insert(infos.ids[i]);

        results = wordSeg(infos.contents[i]);
        for(auto it:results)
            inverted_index[it].insert(infos.ids[i]);

        results = wordSeg(infos.authors[i]);
        for(auto it:results)
            inverted_index[it].insert(infos.ids[i]);

        // put all the info for this id into the forward index
        forward_index[infos.ids[i]]["title"] = infos.titles[i];
        forward_index[infos.ids[i]]["author"] = infos.authors[i];
        forward_index[infos.ids[i]]["content"] = infos.contents[i];
        forward_index[infos.ids[i]]["date"] = infos.dates[i];
    }

    // write the JSON data to the two index files
    Json::Value root1,root2;
    for(auto it:inverted_index){
        string str = "";
        for(auto it2 :it.second)
            str += it2 + " ";
        root1[it.first] = str; 
    }
    for(auto it:forward_index){
        Json::Value partner;
        for(auto it2:it.second)
            partner[it2.first] = it2.second;
        root2[it.first] = partner;
    }
    
    Json::StyledWriter sw;
    std::ofstream os1,os2;
    os1.open("inverted_index.json");
    os2.open("forward_index.json");
    os1 << sw.write(root1);
    os2 << sw.write(root2);
    os1.close();
    os2.close();
}

int main(){
    All_News_Info infos;
    SearchData(infos);
    Save_index(infos);

    return 0;
}

2. Wrap the index files with a set of access interfaces (for the search server built later)

  • Load: loads the index files into memory
  • Find: takes a keyword, segments it, and looks up every segmented word in the inverted index to find which documents it appears in (a document may be hit more than once, so the number of hits per document is recorded)
  • Sort: sorts the matched ids by hit count to rank by relevance (more hits means higher relevance and an earlier position); this is essentially sorting a map by value
  • ReturnInfo: takes the sorted ids, fetches their contents through the forward index, and returns them (see the sketch right after this list)
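Before the implementation, here is a minimal driver sketch showing how the four interfaces chain together (this is just an illustration, and the keyword literal is made up; the search server in part 3 does the same thing inside its Search method):

#include "search_engine.h"

int main(){
    std::unordered_map<string,std::unordered_set<string>> inverted_index;
    std::unordered_map<string,std::unordered_map<string,string>> forward_index;
    Load(inverted_index,forward_index);              // 1. load both index files into memory

    std::unordered_map<string,int> ids;
    if(!Find(inverted_index,"新闻",ids))              // 2. "新闻" is a made-up keyword; collect hit counts per id
        return 0;                                     //    no hits, nothing to show

    vector<std::pair<string,int> > sort_ids = Sort(ids);   // 3. order the ids by hit count
    // 4. fetch title/author/date of each id through the forward index
    for(const auto& info : ReturnInfo(sort_ids,forward_index))
        cout << info.title << " | " << info.author << " | " << info.date << endl;
    return 0;
}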

Below is the code for these interfaces.

search_engine.h

#pragma once

#include <iostream>
#include <fstream>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <boost/algorithm/string.hpp>
#include <jsoncpp/json/json.h>
#include "word_segmentation.hpp"

using std::string;
struct News_Info{
    string title;
    string author;
    string date;

    News_Info(string title_,string author_,string date_)
        :title(title_),
         author(author_),
         date(date_)
    {}
};

void Load( std::unordered_map<string,std::unordered_set<string>>& inverted_index,
           std::unordered_map<string,std::unordered_map<string,string>>& forward_index);

bool Find(std::unordered_map<string,std::unordered_set<string>> inverted_index,
          string keyword,std::unordered_map<string,int>& ids);

vector<std::pair<string,int> > Sort(std::unordered_map<string,int> ids);

vector<News_Info> ReturnInfo(const vector<std::pair<string,int> >& sort_ids,
                             std::unordered_map<string,std::unordered_map<string,string>> forward_index);

search_engine.cc

#include "search_engine.h"

// Load the index files into the two hash tables
void Load( std::unordered_map<string,std::unordered_set<string>>& inverted_index,
           std::unordered_map<string,std::unordered_map<string,string>>& forward_index){
    Json::Reader reader;
    Json::Value value;
    std::ifstream is1,is2;
    is1.open("/home/pigff/project/search_engine/inverted_index.json");
    is2.open("/home/pigff/project/search_engine/forward_index.json");
    reader.parse(is1,value);
    vector<string> names = value.getMemberNames();
    for(auto it:names){
        vector<string> v;
        string tmp = value[it].asString();
        boost::algorithm::split(v,tmp,boost::algorithm::is_space());
        for(auto it2:v)
            inverted_index[it].insert(it2);
    }
    value.clear();
    reader.parse(is2,value);
    names.clear();
    names = value.getMemberNames();
    for(auto it:names){
        vector<string> names_names = value[it].getMemberNames();
        for(auto it2:names_names)
            forward_index[it].insert(make_pair(it2,value[it][it2].asString()));
    }
}

// Segment the search keyword and collect the ids of the documents it appears in
bool Find(std::unordered_map<string,std::unordered_set<string>> inverted_index,string keyword,std::unordered_map<string,int>& ids){
    // segment the incoming keyword
    WordSegmentation wordSeg;
    vector<string> results = wordSeg(keyword);
     
    for(auto it:results){
        if(inverted_index[it].empty())
            continue;
        else{
            for(auto it2: inverted_index[it]){
                // this check is needed because splitting the id string from the file can yield an empty token (trailing space)
                if(!it2.empty())
                    ids[it2]++;
            }
        }
    }

    if(ids.empty())
        return false;

    return true;
}

// Sort the matched news items by relevance
// This is essentially sorting a map by value:
// since std::sort only works on sequence containers,
// the data is first copied into a vector,
// and a functor Compare is passed to std::sort
class Compare{
public:
    bool operator()(const std::pair<string,int>& x, const std::pair<string,int>& y) {
        // a higher hit count means higher relevance, so sort in descending order
        return x.second > y.second;
    }
};
vector<std::pair<string,int> > Sort(std::unordered_map<string,int> ids){
    vector<std::pair<string,int> > ret(ids.begin(),ids.end());
    sort(ret.begin(),ret.end(),Compare());
    return ret;
}

// Fetch the corresponding info structs for the sorted ids
vector<News_Info> ReturnInfo(const vector<std::pair<string,int> >& sort_ids,
                             std::unordered_map<string,std::unordered_map<string,string>> forward_index){
    vector<News_Info> ret;
    for(auto i:sort_ids){
        News_Info news_info(forward_index[i.first]["title"],
                            forward_index[i.first]["author"],
                            forward_index[i.first]["date"]);
        ret.push_back(news_info);
    }
    return ret;
}

3. Build the search server and the search client interface

  • The server is built on Baidu's open-source brpc framework (partly because I wanted to try out someone else's framework =_=)
  • brpc is based on protobuf, so the first step is to write a .proto file (see the brpc website for a tutorial on how; I may also cover it in a later post)
  • The next step is the brpc search server itself. I won't go into every detail here; the official site has plenty of examples, and the documentation is available in Chinese. This server keeps running in the background.
  • Finally, wrap a client interface for the search CGI: after the HTTP server receives a keyword, it talks to the search server through this client

Below is the code.

SG.proto

syntax = "proto2";
// tell protoc to generate the C++ Service base class
option cc_generic_services = true;

package SG;           // package name


message Request {
    optional string keyword = 1;
};

message Response {
    repeated Info info = 1; 
};

message Info{
    optional  string  title   = 1;
    optional  string  author  = 2;
    optional  string  date    = 3;
};

 
service Service {
    rpc Search(Request) returns (Response);
};
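For reference, the SG.pb.h / SG.pb.cc files included below are generated from this file with protoc --cpp_out=. SG.proto; the cc_generic_services = true option is what makes protoc emit the SG::Service base class the server implements and the SG::Service_Stub the client calls.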

server.cc

#include <gflags/gflags.h>
#include <brpc/server.h>
#include <butil/logging.h>
#include <brpc/stream.h>
#include "/home/pigff/project/search_engine/search_engine.h"
#include "SG.pb.h"

DEFINE_bool(attachment, true, "Echo attachment as well");      
DEFINE_int32(port, 9999, "TCP Port of this server");
DEFINE_int32(idle_timeout_s, -1, "Connection will be closed if there is no "
                          "read/write operations during the last `idle_timeout_s'");
DEFINE_int32(logoff_ms, 2000, "Maximum duration of server's LOGOFF state "
                          "(waiting for client to close connection before server stops)");

namespace Search{
// implement the Service base class generated from the proto
class SearchService : public SG::Service{
public:
    SearchService(){
        // call Load in the constructor so the index files are already in memory
        Load(inverted_index,forward_index);   
    }

    void Search(google::protobuf::RpcController* cntl_base,
                const SG::Request* req,
                SG::Response* resp,
                google::protobuf::Closure* done){
        // this guard makes sure done->Run() is called automatically when we return
        brpc::ClosureGuard done_guard(done);

        brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_base);
        
        // log the request to see how the client interacts with the server
        LOG(INFO) << "Received request[log_id=" << cntl->log_id() 
            << "] from " << cntl->remote_side() 
            << " to " << cntl->local_side()
            << ": " << req->keyword()
            << " (attached=" << cntl->request_attachment() << ")";

        // segment the keyword and look the words up in the inverted index
        std::unordered_map<string,int> ids;
        // if nothing is found, leave the response fields unset (empty)
        if(Find(inverted_index,req->keyword(),ids) == false)
             return;

        // sort the matches by relevance
        vector<std::pair<string,int> > sort_ids = Sort(ids);
           
        // fetch the contents for the sorted ids
        vector<News_Info> infos = ReturnInfo(sort_ids,forward_index); 

        for(size_t i = 0;i < infos.size();++i)
            resp->add_info();
        // fill in the response
        for(int i = 0;i < resp->info_size();++i){
            SG::Info* info = resp->mutable_info(i);
            info->set_title(infos[i].title);
            info->set_author(infos[i].author);
            info->set_date(infos[i].date);
        }

        if(FLAGS_attachment){
            // set an attachment, which is written to the network directly instead of being serialized into the protobuf message
            cntl->response_attachment().append(cntl->request_attachment()); 
        }
    }    
private:
    std::unordered_map<string,std::unordered_set<string>> inverted_index;
    std::unordered_map<string,std::unordered_map<string,string>> forward_index;
};

}//end namespace

int main(int argc,char* argv[]){
    daemon(1,1);
    // parse gflags
    gflags::ParseCommandLineFlags(&argc,&argv,true);
    
    // the server object
    brpc::Server server;
    
    // an instance of the service defined in the proto
    Search::SearchService search_service;

    // add the service to the server
    // the second argument says the service lives on the stack, so the server must not delete it
    // (use brpc::SERVER_OWNS_SERVICE if you want the server to own and delete it)
    if(server.AddService(&search_service,brpc::SERVER_DOESNT_OWN_SERVICE) != 0){
        LOG(ERROR) << "Fail to add SearchService";
        return -1;
    }

    // Start the server.
    brpc::ServerOptions option;
    option.idle_timeout_sec = FLAGS_idle_timeout_s;
    if (server.Start(FLAGS_port, &option) != 0) {
        LOG(ERROR) << "Fail to start EchoServer";
        return -1;
    }
    // run until Ctrl-C is pressed
    server.RunUntilAskedToQuit();
    return 0;
}

client.h

#pragma once 

#include <gflags/gflags.h>
#include <brpc/channel.h>
#include <butil/time.h>
#include <butil/logging.h>
#include <brpc/stream.h>
#include <boost/algorithm/string.hpp>
#include "/home/pigff/project/search_server/SG.pb.h"
#include "/home/pigff/project/search_engine/search_engine.h"

using std::string;

DECLARE_string(protocol);
DECLARE_string(search_attachment);
DECLARE_string(connection_type);
DECLARE_string(search_server);
DECLARE_string(load_balancer);
DECLARE_int32(timeout_ms);
DECLARE_int32(max_retry); 
DECLARE_int32(interval_ms);
DECLARE_string(http_content_type);

class Client{
public:
    Client();
    vector<News_Info> Return(string keyword);
private:
    // the client-side channel to the search server
    brpc::Channel channel;
    // options used to initialize the channel
    brpc::ChannelOptions options;
};

client.cc

#include "client.h"

DEFINE_string(protocol, "baidu_std", "Protocol type. Defined in src/brpc/options.proto");
DEFINE_string(search_attachment, "foo", "Carry this along with requests");
DEFINE_string(connection_type, "", "Connection type. Available values: single, pooled, short");
DEFINE_string(search_server, "0.0.0.0:9999", "IP Address of server");
DEFINE_string(load_balancer, "", "The algorithm for load balancing");
DEFINE_int32(timeout_ms, 3000, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); 
DEFINE_int32(interval_ms, 5000, "Milliseconds between consecutive requests");
DEFINE_string(http_content_type, "application/json", "Content type of http request");

Client::Client(){
    options.protocol = FLAGS_protocol;
    options.connection_type = FLAGS_connection_type;
    options.timeout_ms = FLAGS_timeout_ms;
    options.max_retry = FLAGS_max_retry;
    channel.Init(FLAGS_search_server.c_str(),FLAGS_load_balancer.c_str(),&options); 
}
vector<News_Info> Client::Return(string keyword){
    // Normally, you should not call a Channel directly, but instead construct
    // a stub Service wrapping it. stub can be shared by all threads as well.
    SG::Service_Stub stub(&channel);
    
    SG::Request req;
    SG::Response resp;
    brpc::Controller cntl;
    
    req.set_keyword(keyword);
    
    if (FLAGS_protocol != "http" && FLAGS_protocol != "h2c")  {
        // Set attachment which is wired to network directly instead of 
        // being serialized into protobuf messages.
        cntl.request_attachment().append(FLAGS_search_attachment);
    } else {
        cntl.http_request().set_content_type(FLAGS_http_content_type);
    }
    
    // Because `done'(last parameter) is NULL, this function waits until
    // the response comes back or error occurs(including timedout).
    stub.Search(&cntl, &req, &resp, NULL);
    vector<News_Info> v;
    if (!cntl.Failed()) {
        // copy the response into the vector returned to the caller
        for(auto i: resp.info()){
            News_Info info(i.title(),i.author(),i.date());
            v.push_back(info);
        }
    }else 
        LOG(WARNING) << cntl.ErrorText();

    return v;
}
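On the CGI side the client is meant to be used roughly like this (a sketch only; in the real CGI the keyword comes from the HTTP request rather than a hard-coded string):

#include "client.h"

int main(int argc,char* argv[]){
    // parse the DEFINE_* flags above (server address, protocol, timeout, ...)
    gflags::ParseCommandLineFlags(&argc,&argv,true);
    Client client;                                 // initializes the brpc channel
    // "新闻" is a made-up keyword; the CGI would pass the user's query here
    vector<News_Info> results = client.Return("新闻");
    for(const auto& r : results)
        cout << r.title << " " << r.author << " " << r.date << endl;
    return 0;
}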


Reposted from blog.csdn.net/lvyibin890/article/details/81259356