基于Kubernetes的机器学习微服务系统设计系列——(四)中文分词微服务

  中文分词微服务包括的分词方法有:RobinSeg(RS)、IKAnalyzer(IK)、JEAnalysis(JE)、MmSeg4j(MS)、PaoDing(PD)、SmallSeg4j(SS)。其中RS分词实现见我的文章:知更鸟中文分词RS设计实现,其他分词方法都采用已发布的jar包进行封装。

设计模式

  主要涉及外观模式、适配器模式、工厂模式和单例模式。分词微服务类图如图所示:

中文分词微服务类图

  设计原则:(1)针对接口编程,不要针对实现;(2)只和最紧密的类交互;(3)封装变化;(4)松耦合设计。
  外观模式:提供一个统一的接口,用来访问子系统中的一群接口,外观定义了一个高层接口,让子系统更容易使用。我们采用统一的分词外观类封装各种分词接口,提供一个一致的高层接口。
  适配器模式:将一个类的接口,转换成客户期望的另一个接口。适配器让原本接口不兼容的类可以合作无间。各种分词器的私有实现接口各不相同,需要通过适配器提供一个统一的调用接口。
  工厂模式:定义一个创建对象的接口,但由子类决定要实例化的类是哪一个。提供统一的分词工厂,创建分词实例对象。
  单例模式:确保一个类只有一个实例,并提供一个全局访问点。由于各种分词对象的创建、加载词典等需要申请大量的内存,耗费大量的时间,所以所有分词器实例都通过适配器进行控制,只创建一个实例。

代码实现

中文分词接口抽象类

package com.robin.segment;

import com.robin.log.RobinLogger;
import java.util.logging.Logger;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Abstract base class that defines the common Chinese word segmentation
 * interface implemented by every concrete segmenter.</DD>
 *
 * @version Version1.0
 * @author  Robin
 * @version <I> Date:2018-04-18</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public abstract class AbstractSegmenter {

    /** Logger shared with subclasses. */
    protected static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Segments the given text into words.
     *
     * @param text the text to segment
     * @param separator the separator string used to delimit the words in the result
     * @return the segmented text
     */
    public abstract String segment(String text, String separator);
}

统一分词器外观类

package com.robin.segment;

import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.RobinSeg;
import com.robin.segment.robinseg.SegmentArgs;
import java.util.logging.Logger;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Unified facade over all segmenters.</DD>
 * <DD>Facade pattern.</DD>
 *
 * @version 1.0
 * @author Robin
 * @version <I> Date:2018-04-19</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class SegmentFacade {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /**
     * Returns the configuration object of the segmenter selected by
     * {@code methodName}.
     *
     * <p>Only RobinSeg exposes a configuration object. A {@code null} method is
     * treated as RobinSeg, matching the default applied by
     * {@link SegmentFactory#getSegInstance(SegmentMethod)}.
     *
     * @param methodName segmentation method; {@code null} selects RobinSeg
     * @return the RobinSeg configuration object, or {@code null} when the
     *         selected method is not RobinSeg
     */
    public static SegmentArgs getSegmentArgsObj(SegmentMethod methodName) {
        // Answer early for non-RS methods so that asking for their (non-existent)
        // configuration does not force the corresponding segmenter to be created.
        if (null != methodName && SegmentMethod.RS != methodName) {
            return null;
        }
        AbstractSegmenter segment = SegmentFactory.getSegInstance(methodName);
        // Guard the downcast instead of trusting the method name alone.
        if (segment instanceof RobinSeg) {
            return ((RobinSeg) segment).getSegmentConfInstance();
        }
        return null;
    }

    /**
     * <DD>Segments text with the requested algorithm.</DD>
     * <DD>RobinSeg is used when the method is {@code null} or not recognised
     * by the factory.</DD>
     *
     * @param methodName segmentation method: {@code SegmentMethod.IK},
     * {@code .JE}, {@code .MS}, {@code .PD}, {@code .SS} or {@code .RS}
     * @param text text to segment
     * @param separator separator string
     * @return the text segmented with the given separator
     */
    public static String split(SegmentMethod methodName, String text, String separator) {
        AbstractSegmenter segmenter = SegmentFactory.getSegInstance(methodName);
        return segmenter.segment(text, separator);
    }
}

分词Action实现类

package com.robin.segment.action;

import com.robin.loader.MircoServiceAction;
import com.robin.log.RobinLogger;
import com.robin.segment.SegmentFacade;
import com.robin.segment.SegmentFactory.SegmentMethod;
import com.robin.segment.robinseg.SegmentArgs;
import com.robin.segment.robinseg.SegmentArgs.SegAlgorithm;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.codehaus.jettison.json.JSONArray;
import org.codehaus.jettison.json.JSONException;
import org.codehaus.jettison.json.JSONObject;

/**
 * <DT><B>描述:</B></DT>
 * <DD>分词Action实现类</DD>
 *
 * @version Version1.0
 * @author Robin
 * @version <I> V1.0 Date:2018-06-05</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class SegmentAction implements MircoServiceAction {

    private static final Logger LOGGER = RobinLogger.getLogger();

    public enum StatusCode {
        OK,
        JSON_ERR,
        KIND_ERR,
        VERSION_ERR,
        SEGMETHOD_ERR,
        SEPARATOR_ERR,
        SEGMENT_FAILED,
        TEXTS_NULL,
    }

    private class ActionStatus {

        StatusCode statusCode;
        String msg;

    }

    private JSONObject getErrorJson(ActionStatus actionStatus) {
        JSONObject errJson = new JSONObject();
        try {
            errJson.put("status", actionStatus.statusCode.toString());
            errJson.put("msg", actionStatus.msg);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage());
        }
        return errJson;
    }

    private ActionStatus checkJSONObjectTerm(JSONObject jsonObj,
            String key,
            HashSet<String> valueSet,
            StatusCode errStatusCode) {
        ActionStatus actionStatus = new ActionStatus();

        try {
            if (!jsonObj.isNull(key)) {
                String value = jsonObj.getString(key);
                if (!valueSet.contains(value)) {
                    actionStatus.msg = "The value [" + value + "] of " + key + " is error.";
                    actionStatus.statusCode = errStatusCode;
                    return actionStatus;
                }
            } else {
                actionStatus.msg = "The input parameter is missing " + key + ".";
                actionStatus.statusCode = errStatusCode;
                return actionStatus;
            }

        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage());
        }

        actionStatus.statusCode = StatusCode.OK;
        return actionStatus;
    }

    private ActionStatus checkInputJSONObject(JSONObject jsonObj) {
        ActionStatus actionStatus = new ActionStatus();
        ActionStatus retActionStatus;

        JSONObject argsJson;
        HashSet<String> valueSet = new HashSet();

        try {
            valueSet.add("segment");
            retActionStatus = checkJSONObjectTerm(jsonObj, "kind", valueSet, StatusCode.KIND_ERR);
            if (!retActionStatus.statusCode.equals(StatusCode.OK)) {
                return retActionStatus;
            }

            valueSet.clear();
            valueSet.add("v1");
            retActionStatus = checkJSONObjectTerm(jsonObj, "version", valueSet, StatusCode.VERSION_ERR);
            if (!retActionStatus.statusCode.equals(StatusCode.OK)) {
                return retActionStatus;
            }

            JSONObject segmentMetadata = jsonObj.getJSONObject("metadata").getJSONObject("segment");

            valueSet.clear();
            valueSet.add("RS");
            valueSet.add("IK");
            valueSet.add("JE");
            valueSet.add("MS");
            valueSet.add("PD");
            valueSet.add("SS");
            retActionStatus = checkJSONObjectTerm(segmentMetadata, "method", valueSet, StatusCode.SEGMETHOD_ERR);
            if (!retActionStatus.statusCode.equals(StatusCode.OK)) {
                return retActionStatus;
            }

            valueSet.clear();
            valueSet.add(" ");
            valueSet.add("|");
            valueSet.add("/");
            retActionStatus = checkJSONObjectTerm(segmentMetadata, "separator", valueSet, StatusCode.SEPARATOR_ERR);
            if (!retActionStatus.statusCode.equals(StatusCode.OK)) {
                return retActionStatus;
            }

            // 设置RobinSeg分词参数
            String method = segmentMetadata.getString("method");
            SegmentMethod segmentMethod = SegmentMethod.valueOf(method);
            if ((segmentMethod.equals(SegmentMethod.RS)) && (!segmentMetadata.isNull("args"))) {
                argsJson = segmentMetadata.getJSONObject("args");
                SegmentArgs segmentArgs = SegmentFacade.getSegmentArgsObj(segmentMethod);
                if (null != segmentArgs) {
                    if (!argsJson.isNull("algorithm")) {
                        String algorithm = argsJson.getString("algorithm");
                        segmentArgs.setSegAlgorithm(SegAlgorithm.valueOf(algorithm.toUpperCase()));
                    }
                    if (!argsJson.isNull("cleanSymbol")) {
                        Boolean flag = argsJson.getBoolean("cleanSymbol");
                        segmentArgs.setCleanSymbolFlag(flag);
                    }
                    if (!argsJson.isNull("markNewWord")) {
                        Boolean flag = argsJson.getBoolean("markNewWord");
                        segmentArgs.setMarkNewWordFlag(flag);
                    }
                    if (!argsJson.isNull("downcasing")) {
                        Boolean flag = argsJson.getBoolean("downcasing");
                        segmentArgs.setDowncasingFlag(flag);
                    }
                    if (!argsJson.isNull("mergePattern")) {
                        Boolean flag = argsJson.getBoolean("mergePattern");
                        segmentArgs.setMergePatternFlag(flag);
                    }
                    if (!argsJson.isNull("retrievalPattern")) {
                        Boolean flag = argsJson.getBoolean("retrievalPattern");
                        segmentArgs.setRetrievalPatternFlag(flag);
                    }
                }
            }
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage());
        }

        actionStatus.statusCode = StatusCode.OK;

        return actionStatus;
    }

    @Override
    public Object action(Object obj) {

        ActionStatus actionStatus = new ActionStatus();
        ActionStatus retActionStatus;

        if (!(obj instanceof JSONObject)) {
            actionStatus.msg = "The action arguments is not JSONObject.";
            LOGGER.log(Level.SEVERE, actionStatus.msg);
            actionStatus.statusCode = StatusCode.JSON_ERR;
            return this.getErrorJson(actionStatus);
        }

        JSONObject jsonObj = (JSONObject) obj;
        retActionStatus = this.checkInputJSONObject(jsonObj);
        if (!retActionStatus.statusCode.equals(StatusCode.OK)) {
            LOGGER.log(Level.SEVERE, retActionStatus.msg);
            return this.getErrorJson(retActionStatus);
        }

        SegmentMethod segmentMethod;
        String separator;
        JSONObject texts;

        try {
            JSONObject segmentMetadata = jsonObj.getJSONObject("metadata").getJSONObject("segment");
            String method = segmentMetadata.getString("method");
            segmentMethod = SegmentMethod.valueOf(method);
            separator = segmentMetadata.getString("separator");
            texts = jsonObj.getJSONObject("texts");
            long beginTime = System.currentTimeMillis();
            if (null == texts) {
                actionStatus.statusCode = StatusCode.TEXTS_NULL;
                actionStatus.msg = "The input texts is null.";
                LOGGER.log(Level.SEVERE, actionStatus.msg);
                return this.getErrorJson(actionStatus);
            }

            Iterator labelsIt = texts.keys();
            while (labelsIt.hasNext()){
                String label = (String) labelsIt.next();
                JSONArray aLabelTexts = texts.getJSONArray(label);
                int len = aLabelTexts.length();
                for (int i = 0; i < len; i++) {
                    JSONObject textJson = aLabelTexts.getJSONObject(i);
                    String text = textJson.getString("text");
                    if (null != text) {
                        String result = SegmentFacade.split(segmentMethod, text, separator);
                        textJson.put("text", result);
                    }
                }
            }

            long endTime = System.currentTimeMillis();
            int spendTime = (int) (endTime - beginTime);
            segmentMetadata.put("spendTime", spendTime);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage());
        }

        JSONObject rsp = new JSONObject();
        try {
            rsp.put("status", "OK");
            rsp.put("result", jsonObj);
        } catch (JSONException ex) {
            LOGGER.log(Level.SEVERE, ex.getMessage());
        }
        return rsp;
    }
}

分词实例工厂方法类

package com.robin.segment;

import com.robin.segment.adapter.SmallSeg4jAdapter;
import com.robin.segment.adapter.MmSeg4jAdapter;
import com.robin.segment.adapter.IKAnalyzerAdapter;
import com.robin.segment.adapter.JEAnalysisAdapter;
import com.robin.segment.adapter.PaoDingAdapter;
import com.robin.log.RobinLogger;
import com.robin.segment.robinseg.RobinSeg;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Factory that hands out the segmenter instance for a requested method.</DD>
 *
 * @version Version1.0
 * @author  Robin
 * @version <I> Date:2018-04-19</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class SegmentFactory {

    // Logger
    private static final Logger LOGGER = RobinLogger.getLogger();

    /** Identifiers of the supported segmentation algorithms. */
    public enum SegmentMethod {

        /** JE  = "JEAnalysis" */
        JE,
        /** IK  = "IKAnalyzer"*/
        IK,
        /** MS  = "MmSeg4j" */
        MS,
        /** PD  = "PaoDing" */
        PD,
        /** SS  = "SmallSeg4j" */
        SS,
        /** RS  = "RobinSeg" */
        RS
    }

    /**
     * Returns the segmenter instance that implements the requested method.
     *
     * @param methodName segmentation method: {@code SegmentMethod.IK},
     * {@code .JE}, {@code .MS}, {@code .PD}, {@code .SS} or {@code .RS};
     * {@code null} selects RobinSeg
     * @return the segmenter instance for the method
     */
    public static AbstractSegmenter getSegInstance(SegmentMethod methodName) {
        // A missing method falls back to RobinSeg.
        final SegmentMethod effectiveMethod = (methodName != null) ? methodName : SegmentMethod.RS;

        final AbstractSegmenter segmenter;
        switch (effectiveMethod) {
            case JE:
                segmenter = JEAnalysisAdapter.getInstance();
                break;
            case IK:
                segmenter = IKAnalyzerAdapter.getInstance();
                break;
            case MS:
                segmenter = MmSeg4jAdapter.getInstance();
                break;
            case PD:
                segmenter = PaoDingAdapter.getInstance();
                break;
            case SS:
                segmenter = SmallSeg4jAdapter.getInstance();
                break;
            case RS:
                segmenter = RobinSeg.getInstance();
                break;
            default:
                // Defensive fallback for a method this factory does not know.
                LOGGER.log(Level.WARNING, "分词方法名称错误,默认采用RobinSeg分词.");
                segmenter = RobinSeg.getInstance();
                break;
        }
        return segmenter;
    }
}

IK适配器类

package com.robin.segment.adapter;

import com.robin.segment.AbstractSegmenter;
import java.io.IOException;
import java.io.StringReader;
import java.util.logging.Level;

import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;

/**
 * <DT><B>Description:</B></DT>
 * <DD>Adapts the IKAnalyzer 3.2.0 segmenter to {@link AbstractSegmenter}.</DD>
 * <DD>Adapter pattern, singleton pattern.</DD>
 *
 * @version Version1.0
 * @author  Robin
 * @version <I> Date:2018-04-17</I>
 * @author  <I> E-mail:[email protected]</I>
 */
public class IKAnalyzerAdapter extends AbstractSegmenter {

    /** Lazily created singleton instance. */
    protected static AbstractSegmenter instance = null;

    private IKAnalyzerAdapter() {
    }

    /**
     * Segments text and appends the separator after every word, so a non-empty
     * result ends with the separator.
     *
     * @param text text to segment; {@code null} or empty yields an empty string
     * @param separator separator appended after each word; must not be {@code null}
     * @return the segmented text; if reading the text fails, the words produced
     *         so far are returned
     */
    @Override
    public String segment(String text, String separator) {

        // Defensive guard: nothing to segment.
        if (null == text || "".equals(text)) {
            return "";
        }

        // The second argument selects max-word-length matching, per the original author's note.
        IKSegmentation ikSeg = new IKSegmentation(new StringReader(text), true);
        StringBuilder sb = new StringBuilder();
        try {
            Lexeme l = null;
            while ((l = ikSeg.next()) != null) {
                sb.append(l.getLexemeText().concat(separator));
            }
        } catch (IOException e) {
            // Best effort: keep the words produced so far, but log the full cause.
            LOGGER.log(Level.SEVERE, e.getMessage(), e);
        }

        return sb.toString();
    }

    /**
     * Returns the single IKAnalyzer adapter instance, creating it on first use.
     * Synchronized so that concurrent first calls cannot create two instances.
     *
     * @return the singleton segmenter instance
     */
    public static synchronized AbstractSegmenter getInstance() {
        if (null == instance) {
            instance = new IKAnalyzerAdapter();
        }
        return instance;
    }
}

请求JSON

  中文分词微服务请求JSON格式如下,红框标示了请求参数和原始文本。

响应JSON

  中文分词微服务响应JSON格式如下,红框标示分词消耗时间和分词结果。

猜你喜欢

转载自blog.csdn.net/xsdjj/article/details/83903944