导读 | DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而DFA在DFA算法中几乎没有什么计算,有的只是状态的转换。 |
敏感词、文字过滤是一个网站必不可少的功能,如何设计一个好的、高效的过滤算法是非常有必要的。
在实现文字过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而DFA在DFA算法中几乎没有什么计算,有的只是状态的转换。
下面看下在c#方法下实现方式
1、构建敏感词库类
private bool LoadDictionary() { var wordList = new List(); if (_memoryLexicon == null) { _memoryLexicon = new WordGroup[char.MaxValue]; var words = new SensitiveWordBll().GetAllWords(); if (words == null) return false; foreach (string word in words) { wordList.Add(word); var chineseWord = Microsoft.VisualBasic.Strings.StrConv(word, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0); if (word != chineseWord) wordList.Add(chineseWord); } foreach (var word in wordList) { if (word.Length > 0) { var group = _memoryLexicon[word[0]]; if (group == null) { group = new WordGroup(); _memoryLexicon[word[0]] = group; } group.Add(word.Substring(1)); } } } return true; }
2、构建敏感词检测类
private bool Check(string blackWord) { _wordlenght = 0; //检测源下一位游标 _nextCursor = _cursor + 1; var found = false; var continueCheck = 0; //遍历词的每一位做匹配 for (var i = 0; i < blackword.length;="" i++)="" {="" 特殊字符偏移游标="" var="" offset="0;" if="" (_nextcursor="">= _sourceText.Length) { if (i - 1 < blackword.length="" -="" 1)="" found="false;" break;="" }="" else="" {="" 检测下位字符如果不是汉字="" 数字="" 字符="" 偏移量加1="" for="" (var="" y="_nextCursor;" y="">< _sourcetext.length;="" y++)="" {="" if="" (!ischs(_sourcetext[y])="" &&="" !isnum(_sourcetext[y])="" &&="" !isalphabet(_sourcetext[y]))="" {="" offset++;="" 避让特殊字符,下位游标如果="">=字符串长度 跳出 if (_nextCursor + offset >= _sourceText.Length) break; _wordlenght++; } else break; } if (_nextCursor + offset >= _sourceText.Length) { found = false; break; } if (blackWord[i] == _sourceText[_nextCursor + offset]) { found = true; continueCheck = 0; } else { // 匹配不到时尝试继续匹配4个字符 if (continueCheck < 4="" &&="" _nextcursor="">< _sourcetext.length="" -="" 1)="" {="" continuecheck++;="" i--;="" }="" else="" {="" found="false;" break;="" }="" }="" }="" _nextcursor="_nextCursor" +="" 1="" +="" offset;="" _wordlenght++;="" }="" return="" found;="" }="" }="">
3、测试与使用方法
_illegalWords = new List(); if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText)) { return sourceText; } if (!string.IsNullOrEmpty(sourceText)) _sourceText = sourceText; _cursor = 0; if (!LoadDictionary()) { return _sourceText; } var tempString = _sourceText.ToCharArray(); var sourceTextDbc = ToDBC(SourceText); for (var i = 0; i < sourcetext.length;="" i++)="" {="" 查询以该字为首字符的词组="" var="" group="_memoryLexicon[sourceTextDbc[i]];" if="" (group="" !="null)" {="" for="" (var="" z="0;" z="">< group.count();="" z++)="" {="" string="" word="group.GetWord(z);" if="" (word.length="=" 0="" ||="" check(word))="" {="" if="" (isfirstcheckedreturn)="" {="" return="" null;="" }="" var="" blackword="string.Empty;" for="" (var="" pos="0;" pos="">< _wordlenght="" +="" 1;="" pos++)="" {="" blackword="" +="tempString[pos" +="" _cursor].tostring();="" tempstring[pos="" +="" _cursor]="ReplaceChar;" }="" _illegalwords.add(blackword);="" _cursor="_cursor" +="" _wordlenght;="" i="i" +="" _wordlenght;="" break;="" }="" }="" }="" _cursor++;="" }="" return="" new="" string(tempstring);="" var="" filter="new" sensitivewordfilter();="" filter.sourcetext="dddddd" ;="" var="" sourcttext="filter.SourceText;" filter.resetmemorylexicon();="" var="" datetime="DateTime.Now;" var="" ss="filter.Filter();" var="" datetime2="DateTime.Now;" var="" millisecond="(datetime2" -="" datetime).totalmilliseconds;="" console.writeline(millisecond);="" console.writeline(ss);="" var="" words="System.IO.File.ReadAllLines(@" d:\recv\敏感词库大全.txt","="" system.text.encoding.utf8);="" var="" ssx="sourctText;" var="" datetimex="DateTime.Now;" foreach="" (var="" word="" in="" words)="" {="" if="" (word.length=""> 0) ssx = ssx.Replace(word, "*".PadLeft(word.Length, '*')); } var datetime2x = DateTime.Now; var millisecondx = (datetime2x - datetimex).TotalMilliseconds; Console.WriteLine(millisecondx); Console.WriteLine(ssx);