unity c#非法字符(脏词)检测

项目中非法字符检测是必须的,聊天系统不屏蔽各种不文明用语

先说说我的原理吧

1.读取非法字符表,把相同的首字符归类到字典,类似新华字典那样

2.然后把输入的字符串,一个个字符找对应的首字符字典,遍历首字符字典,在当前字符后面截取对应的字符长度得到的字符串然后比较,如果字符串相同则认为有非法字符

下面是测试结果

下面为完整代码,有注释应该比较容易看懂


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using UnityEngine;

/// <summary>
/// 非法关键词过滤(自动忽略汉字数字字母间的其他字符)
/// </summary>
public class FilterWord
{
    public FilterWord()
    {
        TextAsset asset = Resources.Load("dirtywords") as TextAsset;
        m_AllFilterWord = asset.text;
    }

    private string m_AllFilterWord = string.Empty;
    /// <summary>
    /// 词库路径
    /// </summary>
    public string AllFilterWord
    {
        get { return m_AllFilterWord; }
        set { m_AllFilterWord = value; }
    }

    /// <summary>
    /// 内存词典
    /// </summary>
    private WordGroup[] MEMORYLEXICON = new WordGroup[(int)char.MaxValue];

    private string sourctText = string.Empty;
    private bool m_IsInitalize = false;
    /// <summary>
    /// 检测源
    /// </summary>
    public string SourceText
    {
        get { return sourctText; }
        set { sourctText = value; }
    }

    /// <summary>
    /// 检测源游标
    /// </summary>
    int cursor = 0;

    /// <summary>
    /// 匹配成功后偏移量
    /// </summary>
    int wordlenght = 0;

    /// <summary>
    /// 检测词游标
    /// </summary>
    int nextCursor = 0;

    private List<string> illegalWords = new List<string>();

    /// <summary>
    /// 检测到的非法词集
    /// </summary>
    public List<string> IllegalWords
    {
        get { return illegalWords; }
    }

    /// <summary>
    /// 判断是否是中文
    /// </summary>
    /// <param name="character"></param>
    /// <returns></returns>
    private bool isCHS(char character)
    {
        //  中文表意字符的范围 4E00-9FA5
        int charVal = (int)character;
        return (charVal >= 0x4e00 && charVal <= 0x9fa5);
    }

    /// <summary>
    /// 判断是否是数字
    /// </summary>
    /// <param name="character"></param>
    /// <returns></returns>
    private bool isNum(char character)
    {
        int charVal = (int)character;
        return (charVal >= 48 && charVal <= 57);
    }

    /// <summary>
    /// 判断是否是字母
    /// </summary>
    /// <param name="character"></param>
    /// <returns></returns>
    private bool isAlphabet(char character)
    {
        int charVal = (int)character;
        return ((charVal >= 97 && charVal <= 122) || (charVal >= 65 && charVal <= 90));
    }

    /// <summary>
    /// 转半角小写的函数(DBC case)
    /// </summary>
    /// <param name="input">任意字符串</param>
    /// <returns>半角字符串</returns>
    ///<remarks>
    ///全角空格为12288,半角空格为32
    ///其他字符半角(33-126)与全角(65281-65374)的对应关系是:均相差65248
    ///</remarks>
    private string ToDBC(string input)
    {
        char[] c = input.ToCharArray();
        for (int i = 0; i < c.Length; i++)
        {
            if (c[i] == 12288)
            {
                c[i] = (char)32;
                continue;
            }
            if (c[i] > 65280 && c[i] < 65375)
                c[i] = (char)(c[i] - 65248);
        }
        return new string(c).ToLower();
    }

    /// <summary>
    /// 加载内存词库
    /// </summary>
    public void LoadDictionary()
    {
        if (m_IsInitalize)
        {
            return;
        }

        m_IsInitalize = true;
        List<string> wordList = new List<string>();
        Array.Clear(MEMORYLEXICON, 0, MEMORYLEXICON.Length);
        string[] words = AllFilterWord.Split('\n');
        foreach (string word in words)
        {
            string str = word.Replace("\r", "");
            string key = this.ToDBC(str);
            wordList.Add(key);
        }
        Comparison<string> cmp = delegate (string key1, string key2)
        {
            return key1.CompareTo(key2);
        };
        wordList.Sort(cmp);
        for (int i = wordList.Count - 1; i > 0; i--)
        {
            if (wordList[i].ToString() == wordList[i - 1].ToString())
            {
                wordList.RemoveAt(i);
            }
        }
        foreach (var word in wordList)
        {
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }
            WordGroup group = MEMORYLEXICON[word[0]];
            if (group == null)
            {
                group = new WordGroup();
                MEMORYLEXICON[(int)word[0]] = group;

            }
            group.Add(word.Substring(1));
        }
    }

    /// <summary>
    /// 检测
    /// </summary>
    /// <param name="blackWord"></param>
    /// <returns></returns>
    private bool Check(string blackWord)
    {
        wordlenght = 0;
        //检测源下一位游标
        nextCursor = cursor + 1;
        bool found = false;
        string tempStr = ToDBC(sourctText);
        //遍历词的每一位做匹配
        for (int i = 0; i < blackWord.Length; i++)
        {
            //特殊字符偏移游标
            int offset = 0;
            if (nextCursor >= tempStr.Length)
            {
                break;
            }
            else
            {
                if (i >= blackWord.Length
                    || nextCursor + offset >= tempStr.Length)
                {
                    found = false;
                    break;
                }
                if ((int)blackWord[i] == (int)tempStr[nextCursor + offset])
                {
                    if (isAlphabet(tempStr[nextCursor + offset]))
                    {
                        if(tempStr.Length < blackWord.Length)
                        {
                            found = false;
                            break;
                        }
                        if (i >= blackWord.Length - 1)
                        {
                            int temp = nextCursor + offset + 1;
                            if(tempStr.Length > temp)
                            {
                                if(isAlphabet(tempStr[temp]))
                                {
                                    found = false;
                                    break;
                                }
                                else
                                {
                                    found = true;
                                }
                            }
                            else
                            {
                                found = true;
                            }
                        }
                    }
                    else
                    {
                        if (i >= blackWord.Length - 1)
                        {
                            found = true;
                        }
                    }
                }
                else
                {
                    found = false;
                    break;
                }
            }

            nextCursor = nextCursor + 1 + offset;
            wordlenght++;
        }
        return found;
    }

    /// <summary>
    /// 查找并替换
    /// </summary>
    /// <param name="replaceChar"></param>
    public string Filter(char replaceChar)
    {
        cursor = 0;
        nextCursor = 0;
        LoadDictionary();
        if (sourctText != string.Empty)
        {
            //sourctText = sourctText.Replace("\n", "");
            //sourctText = sourctText.Trim();
            char[] tempString = sourctText.ToCharArray();
            for (int i = 0; i < SourceText.Length; i++)
            {
                //查询以该字为首字符的词组
                WordGroup group = MEMORYLEXICON[(int)ToDBC(SourceText)[i]];
                if (group != null)
                {
                    for (int z = 0; z < group.Count(); z++)
                    {
                        string word = group.GetWord(z);
                        if (word.Length == 0 || Check(word))
                        {
                            string blackword = string.Empty;
                            for (int pos = 0; pos < wordlenght + 1; pos++)
                            {
                                blackword += tempString[pos + cursor].ToString();
                                tempString[pos + cursor] = replaceChar;
                            }
                            illegalWords.Add(blackword);
                            cursor = cursor + wordlenght;
                            i = i + wordlenght;
                        }
                    }
                }
                cursor++;
            }
            return new string(tempString);
        }
        else
        {
            return string.Empty;
        }
    }
}

/// <summary>
/// 具有相同首字符的词组集合
/// </summary>
class WordGroup
{
    /// <summary>
    /// 集合
    /// </summary>
    private List<string> groupList;

    public WordGroup()
    {
        groupList = new List<string>();
    }

    /// <summary>
    /// 添加词
    /// </summary>
    /// <param name="word"></param>
    public void Add(string word)
    {
        groupList.Add(word);
    }

    /// <summary>
    /// 获取总数
    /// </summary>
    /// <returns></returns>
    public int Count()
    {
        return groupList.Count;
    }

    /// <summary>
    /// 根据下标获取词
    /// </summary>
    /// <param name="index"></param>
    /// <returns></returns>
    public string GetWord(int index)
    {
        return groupList[index];
    }
}

下面是抽出一个统一方法来调用检测

主要两个方法

1.检测是否有非法字符,返回bool

2.把非法字符转成*号,返回string

using System.Collections;
using System.Collections.Generic;
using UnityEngine;

public class SystemUtil
{
    /// <summary>
    /// 判断是否非法字符
    /// </summary>
    /// <param name="str"></param>
    /// <returns></returns>
    public static bool IsInvaild(string str)
    {
        string source = Filter(str);
        return str != source;
    }

    /// <summary>
    /// 把非法字符变成*号
    /// </summary>
    /// <param name="str"></param>
    /// <returns></returns>
    public static string Filter(string str)
    {
        filterWord.SourceText = str;
        return filterWord.Filter('*');
    }

    public static FilterWord filterWord
    {
        get
        {
            if (null == m_FilterWord)
            {
                m_FilterWord = new FilterWord();
            }
            return m_FilterWord;
        }
    }

    private static FilterWord m_FilterWord;
}

下面是工程下载地址

链接:https://pan.baidu.com/s/1x1RyEugV6N4D_Sj2_JgkUQ 
提取码:lvc3 

猜你喜欢

转载自blog.csdn.net/SnoopyNa2Co3/article/details/84934686