// 敏感词汇过滤DFA算法 (Sensitive-word filtering with a DFA / trie)

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
    /// <summary>
    /// Container for a sensitive-word filter built on a character trie
    /// (the "DFA" model): <see cref="SensitiveWordInit"/> builds the trie from
    /// a word list and <see cref="SensitivewordFilter"/> scans text with it.
    /// </summary>
    public class SensitiveWord
    {
        // Reserved key present in every trie node below the root. Its value is
        // the string "1" when a dictionary word ends at that node and "0"
        // otherwise. Because this character is used as a dictionary key, it is
        // skipped wherever it appears in dictionary words or in scanned text.
        private static readonly char IsEndChar = '$';

        /// <summary>
        /// Loads the word list and turns it into the nested-dictionary trie.
        /// Original author: dxm.
        /// </summary>
        public class SensitiveWordInit
        {

            // Name of the text encoding used to read the word list.
            private static readonly  String ENCODING = "UTF-8";

            /// <summary>
            /// Reads the word list file and builds the trie from it.
            /// </summary>
            /// <returns>The root node of the trie, keyed by first character.</returns>
            public Dictionary<char, object> initKeyWord()
            {
                HashSet<String> wordSet = readSensitiveWordFile();
                return addSensitiveWordToHashMap(wordSet);
            }

            /// <summary>
            /// Builds the trie. Every node is a <c>Dictionary&lt;char, object&gt;</c>
            /// whose character keys map to child nodes and whose
            /// <c>IsEndChar</c> key maps to "1" (a word ends here) or "0".
            /// </summary>
            /// <remarks>
            /// For the words "ab", "abcd" and "ax" the result looks like:
            /// <code>
            /// a = {
            ///       $ = "0"
            ///       b = {
            ///             $ = "1"
            ///             c = {
            ///                   $ = "0"
            ///                   d = { $ = "1" }
            ///             }
            ///       }
            ///       x = { $ = "1" }
            /// }
            /// </code>
            /// </remarks>
            /// <param name="wordSet">The dictionary words to insert.</param>
            /// <returns>The root node of the trie.</returns>
            private Dictionary<char, object> addSensitiveWordToHashMap(HashSet<String> wordSet)
            {
                // Pre-size the root with the word count to limit rehashing.
                Dictionary<char, object> wordMap = new Dictionary<char, object>(wordSet.Count);

                foreach (String word in wordSet)
                {
                    if (string.IsNullOrEmpty(word))
                    {
                        continue;
                    }

                    Dictionary<char, object> node = wordMap;
                    bool insertedAny = false;

                    foreach (char keyChar in word)
                    {
                        // The marker character cannot be stored as a letter.
                        if (keyChar == IsEndChar)
                        {
                            continue;
                        }

                        object existing;
                        Dictionary<char, object> child = null;
                        if (node.TryGetValue(keyChar, out existing))
                        {
                            child = existing as Dictionary<char, object>;
                        }

                        if (child == null)
                        {
                            // New branch: starts out as "not the end of a word".
                            child = new Dictionary<char, object>();
                            child.Add(IsEndChar, "0");
                            node[keyChar] = child;
                        }

                        node = child;
                        insertedAny = true;
                    }

                    // Flag the node of the last stored letter once the whole word
                    // has been walked. Doing it after the loop (instead of when
                    // the loop index hits the last position) also covers words
                    // whose final character is the skipped marker.
                    if (insertedAny)
                    {
                        node[IsEndChar] = "1";
                    }
                }

                return wordMap;
            }

            /// <summary>
            /// Reads "dic.txt" (a relative path) line by line into a set, one
            /// dictionary word per line. Empty lines are ignored.
            /// </summary>
            /// <returns>The distinct non-empty lines of the file.</returns>
            private HashSet<String> readSensitiveWordFile()
            {
                HashSet<String> wordSet = new HashSet<string>();
                foreach (string line in File.ReadLines("dic.txt", Encoding.GetEncoding(ENCODING)))
                {
                    if (line.Length > 0)
                    {
                        wordSet.Add(line);
                    }
                }
                return wordSet;
            }
        }

        /// <summary>
        /// Scans text for dictionary words using the trie produced by
        /// <see cref="SensitiveWordInit"/>. Obtained through <see cref="getInstance"/>.
        /// </summary>
        public class SensitivewordFilter
        {

            // Root node of the trie; assigned once in the constructor.
            private Dictionary<char, object> sensitiveWordMap = null;

            // Match-type value for the "minimum" rule (the default of the public methods).
            public static int minMatchTYpe = 1;

            // Match-type value for the "maximum" rule (longest complete word).
            public static int maxMatchType = 2;

            // Lazily created shared instance.
            private static SensitivewordFilter inst = null;

            /// <summary>
            /// Builds the trie by loading the word list.
            /// </summary>
            private SensitivewordFilter()
            {
                sensitiveWordMap = new SensitiveWordInit().initKeyWord();
            }

            /// <summary>
            /// Returns the shared filter, creating it (and loading the word list)
            /// on first use. No synchronization is performed here.
            /// </summary>
            /// <returns>The shared <see cref="SensitivewordFilter"/>.</returns>
            public static SensitivewordFilter getInstance()
            {
                if (null == inst)
                {
                    inst = new SensitivewordFilter();
                }
                return inst;
            }

            /// <summary>
            /// Tells whether the text contains at least one dictionary word.
            /// </summary>
            /// <param name="txt">Text to scan; null or empty yields false.</param>
            /// <param name="matchType">Match rule, see <see cref="CheckSensitiveWord"/>.</param>
            /// <returns>True as soon as one match is found, otherwise false.</returns>
            public bool isContaintSensitiveWord(String txt, int matchType = 1)
            {
                if (string.IsNullOrEmpty(txt))
                {
                    return false;
                }

                for (int i = 0; i < txt.Length; i++)
                {
                    // One hit is enough; no need to scan the rest of the text.
                    if (CheckSensitiveWord(txt, i, matchType) > 0)
                    {
                        return true;
                    }
                }
                return false;
            }

            /// <summary>
            /// Collects the distinct matched substrings of the text.
            /// </summary>
            /// <param name="txt">Text to scan; null or empty yields an empty set.</param>
            /// <param name="matchType">Match rule, see <see cref="CheckSensitiveWord"/>.</param>
            /// <returns>The set of matched substrings, taken verbatim from the text.</returns>
            public HashSet<String> getSensitiveWord(String txt, int matchType = 1)
            {
                HashSet<String> sensitiveWordList = new HashSet<String>();
                if (string.IsNullOrEmpty(txt))
                {
                    return sensitiveWordList;
                }

                int position = 0;
                while (position < txt.Length)
                {
                    int length = CheckSensitiveWord(txt, position, matchType);
                    if (length > 0)
                    {
                        sensitiveWordList.Add(txt.Substring(position, length));
                        // Resume right after the match so matches never overlap.
                        position += length;
                    }
                    else
                    {
                        position++;
                    }
                }

                return sensitiveWordList;
            }

            /// <summary>
            /// Returns a copy of the text in which every character of every
            /// match is replaced by <paramref name="replaceChar"/>.
            /// </summary>
            /// <param name="txt">Text to scan; null or empty is returned unchanged.</param>
            /// <param name="replaceChar">String emitted once per matched character.</param>
            /// <param name="matchType">Match rule, see <see cref="CheckSensitiveWord"/>.</param>
            /// <returns>The masked text.</returns>
            public String replaceSensitiveWord(String txt, String replaceChar, int matchType = 1)
            {
                if (string.IsNullOrEmpty(txt))
                {
                    return txt;
                }

                // The output is assembled left to right instead of editing a copy
                // of the input in place: in-place edits shift every later index
                // as soon as the replacement is not exactly one character long.
                StringBuilder sb = new StringBuilder(txt.Length);
                int position = 0;
                while (position < txt.Length)
                {
                    int length = CheckSensitiveWord(txt, position, matchType);
                    if (length > 0)
                    {
                        sb.Append(getReplaceChars(replaceChar, length));
                        position += length;
                    }
                    else
                    {
                        sb.Append(txt[position]);
                        position++;
                    }
                }

                return sb.ToString();
            }

            /// <summary>
            /// Repeats <paramref name="replaceChar"/> <paramref name="length"/> times.
            /// </summary>
            /// <param name="replaceChar">String to repeat.</param>
            /// <param name="length">Number of repetitions; values below 1 yield "".</param>
            /// <returns>The concatenated repetitions.</returns>
            private String getReplaceChars(String replaceChar, int length)
            {
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < length; i++)
                {
                    sb.Append(replaceChar);
                }

                return sb.ToString();
            }

            /// <summary>
            /// Looks for a dictionary word that starts at <paramref name="beginIndex"/>.
            /// </summary>
            /// <remarks>
            /// The trie is walked character by character; the marker character is
            /// skipped inside the text, and a match never starts on it. Every time
            /// a node flagged "1" is reached it is remembered as the best match so
            /// far, so running off the trie or off the end of the text falls back
            /// to that match instead of looping or reporting nothing.
            /// <para>
            /// With the minimum rule the walk stops at a complete word when that
            /// node has no children, when it is the last character of the text, or
            /// when an earlier complete word was already passed on the way. With
            /// any other rule the walk continues and the longest complete word wins.
            /// </para>
            /// <para>
            /// A match needs at least two matched letters; single-letter
            /// dictionary entries are never reported.
            /// </para>
            /// </remarks>
            /// <param name="txt">Text to scan; null yields 0.</param>
            /// <param name="beginIndex">Index at which the match must start.</param>
            /// <param name="matchType"><see cref="minMatchTYpe"/> or <see cref="maxMatchType"/>.</param>
            /// <returns>
            /// The number of text characters covered by the match (skipped marker
            /// characters inside it included), or 0 when nothing matches.
            /// </returns>
            public int CheckSensitiveWord(String txt, int beginIndex, int matchType)
            {
                if (txt == null || beginIndex >= txt.Length)
                {
                    return 0;
                }

                // Starting on the marker would report a match that begins on a
                // character that is not part of any word.
                if (txt[beginIndex] == IsEndChar)
                {
                    return 0;
                }

                Dictionary<char, object> node = sensitiveWordMap;
                int matchedLetters = 0;   // letters consumed from the trie so far
                int bestLetters = 0;      // letters of the last complete word seen
                int bestSpan = 0;         // text characters covered by that word
                bool passedCompleteWord = false;
                int len = txt.Length;

                for (int i = beginIndex; i < len; i++)
                {
                    char current = txt[i];
                    if (current == IsEndChar)
                    {
                        continue;
                    }

                    object childValue;
                    if (!node.TryGetValue(current, out childValue))
                    {
                        break;
                    }

                    Dictionary<char, object> child = childValue as Dictionary<char, object>;
                    if (child == null)
                    {
                        break;
                    }

                    node = child;
                    matchedLetters++;

                    object endValue;
                    if (!node.TryGetValue(IsEndChar, out endValue) || !"1".Equals(endValue))
                    {
                        continue;
                    }

                    bool hadEarlierWord = passedCompleteWord;
                    passedCompleteWord = true;
                    bestLetters = matchedLetters;
                    bestSpan = i - beginIndex + 1;

                    bool isLeaf = node.Count == 1;        // only the marker key is present
                    bool isLastChar = i == len - 1;
                    if (SensitivewordFilter.minMatchTYpe == matchType
                        && (isLeaf || hadEarlierWord || isLastChar))
                    {
                        break;
                    }
                }

                // Fewer than two matched letters is not treated as a word.
                if (bestLetters < 2)
                {
                    return 0;
                }
                return bestSpan;
            }
        }
    }
}


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
    /// <summary>
    /// Console demo: masks the sensitive words of a sample sentence and prints
    /// the text before and after, then waits for a key press.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point of the demo.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Obtaining the shared filter first mirrors the original order:
            // the word list is loaded before anything is printed.
            SensitiveWord.SensitivewordFilter wordFilter = SensitiveWord.SensitivewordFilter.getInstance();

            String sample = "$fuckfuck you你麻痹e菜太菜了fuckyou从飞啊 fuck you";
            String masked = wordFilter.replaceSensitiveWord(sample, "*");

            Console.WriteLine(string.Concat("替换前的文字为:", sample));
            Console.WriteLine(string.Concat("替换后的文字为:", masked));

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
    }
}

猜你喜欢

// 转载自 https://blog.csdn.net/zhuankeshumo/article/details/50812856
今日推荐