会员登录 - 用户注册 - 设为首页 - 加入收藏 - 网站地图 C# 敏感词过滤算法实现!

C# 敏感词过滤算法实现

时间:2025-11-03 17:10:19 来源:益强数据堂 作者:域名 阅读:702次

 

本文转载自微信公众号「UP技术控」,作者conan。转载本文请联系UP技术控公众号。

敏感词、文字过滤是一个网站必不可少的功能,如何设计一个好的、高效的过滤算法是非常有必要的。

在实现文字过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而DFA算法中几乎没有什么计算,有的只是状态的转换。

下面看一下在C#中的实现方式。

1、构建敏感词库类

/// <summary>
/// Builds the in-memory lexicon the first time it is needed.
/// Every word returned by <c>SensitiveWordBll.GetAllWords()</c> is stored together with
/// its Traditional Chinese conversion (when that differs), indexed by its first character.
/// </summary>
/// <returns>
/// <c>true</c> when the lexicon is available (already loaded or loaded by this call);
/// <c>false</c> when the word source returned <c>null</c>.
/// </returns>
private bool LoadDictionary()
{
    // Already built: nothing to do.
    if (_memoryLexicon != null)
    {
        return true;
    }

    var words = new SensitiveWordBll().GetAllWords();
    if (words == null)
    {
        // Leave _memoryLexicon untouched so that a later call retries the load
        // instead of reporting success with an empty table.
        return false;
    }

    var wordList = new List<string>();
    foreach (string word in words)
    {
        // Null or empty entries cannot be indexed by a first character.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        wordList.Add(word);

        // Also register the Traditional Chinese form when it differs from the original.
        var chineseWord = Microsoft.VisualBasic.Strings.StrConv(
            word,
            Microsoft.VisualBasic.VbStrConv.TraditionalChinese,
            0);
        if (word != chineseWord)
        {
            wordList.Add(chineseWord);
        }
    }

    // One slot per possible char value, 0 through char.MaxValue inclusive.
    // An array of length char.MaxValue would make an index of '\uFFFF' go out of range.
    var lexicon = new WordGroup[char.MaxValue + 1];
    foreach (var word in wordList)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        var group = lexicon[word[0]];
        if (group == null)
        {
            group = new WordGroup();
            lexicon[word[0]] = group;
        }

        // The first character is the table index; only the remainder is stored in the group.
        group.Add(word.Substring(1));
    }

    // Publish the table only once it is completely built.
    _memoryLexicon = lexicon;
    return true;
}

2、构建敏感词检测类

/// <summary>
/// Tries to match <paramref name="blackWord"/> against the source text starting at the
/// position right after <c>_cursor</c>.
/// Characters for which <c>IsChs</c>, <c>IsNum</c> and <c>IsAlphabet</c> all return false
/// are skipped, and up to four consecutive non-matching characters are tolerated before
/// the match is abandoned.
/// Resets and then updates <c>_wordlenght</c> and <c>_nextCursor</c> as it advances.
/// </summary>
/// <param name="blackWord">The characters of a lexicon word to look for.</param>
/// <returns>
/// <c>true</c> when every character of <paramref name="blackWord"/> was matched;
/// otherwise <c>false</c> (including when <paramref name="blackWord"/> is empty).
/// </returns>
private bool Check(string blackWord)
{
    // Number of consecutive mismatches tolerated before giving up.
    const int toleratedMisses = 4;

    _wordlenght = 0;
    // Next position of the source text to examine.
    _nextCursor = _cursor + 1;

    var matched = false;
    var misses = 0;
    var wordIndex = 0;

    // Walk through the word one character at a time.
    while (wordIndex < blackWord.Length)
    {
        // The source ran out before the whole word was matched.
        if (_nextCursor >= _sourceText.Length)
        {
            matched = false;
            break;
        }

        // Count the special characters (not Chinese, digit or letter) to step over.
        var skipped = 0;
        for (var probe = _nextCursor; probe < _sourceText.Length; probe++)
        {
            if (IsChs(_sourceText[probe]) || IsNum(_sourceText[probe]) || IsAlphabet(_sourceText[probe]))
            {
                break;
            }

            skipped++;
            // Stop once stepping over the special characters reaches the end of the text.
            if (_nextCursor + skipped >= _sourceText.Length)
            {
                break;
            }

            _wordlenght++;
        }

        var candidate = _nextCursor + skipped;
        if (candidate >= _sourceText.Length)
        {
            matched = false;
            break;
        }

        if (blackWord[wordIndex] == _sourceText[candidate])
        {
            matched = true;
            misses = 0;
            wordIndex++;
        }
        else if (misses < toleratedMisses && _nextCursor < _sourceText.Length - 1)
        {
            // Mismatch: retry the same word character against the next source position.
            misses++;
        }
        else
        {
            matched = false;
            break;
        }

        _nextCursor = candidate + 1;
        _wordlenght++;
    }

    return matched;
}
} // presumably closes the enclosing type, whose header is not part of this excerpt

3、测试与使用方法

// --- Body of the filtering routine (its method signature is not part of this excerpt) ---
_illegalWords = new List<string>();

// Nothing to filter: neither the argument nor the stored text has content.
if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
{
    return sourceText;
}

if (!string.IsNullOrEmpty(sourceText))
{
    _sourceText = sourceText;
}

_cursor = 0;
if (!LoadDictionary())
{
    // Lexicon unavailable: return the text unfiltered.
    return _sourceText;
}

var tempString = _sourceText.ToCharArray();
var sourceTextDbc = ToDBC(SourceText);
for (var i = 0; i < SourceText.Length; i++)
{
    // Look up the group of words that start with the current character.
    var group = _memoryLexicon[sourceTextDbc[i]];
    if (group != null)
    {
        for (var z = 0; z < group.Count(); z++)
        {
            string word = group.GetWord(z);

            bool matched;
            if (word.Length == 0)
            {
                // The word has no characters after the first, so the hit is one character long.
                // Check() is not called here, so _wordlenght must be reset explicitly;
                // otherwise the value left by an earlier Check() call would be reused.
                _wordlenght = 0;
                matched = true;
            }
            else
            {
                matched = Check(word);
            }

            if (!matched)
            {
                continue;
            }

            if (isFirstCheckedReturn)
            {
                return null;
            }

            // Record the matched span, then overwrite it with the replacement character.
            var hitLength = _wordlenght + 1;
            _illegalWords.Add(new string(tempString, _cursor, hitLength));
            for (var pos = 0; pos < hitLength; pos++)
            {
                tempString[pos + _cursor] = ReplaceChar;
            }

            // Jump past the masked span; the trailing increments below move to the next character.
            _cursor = _cursor + _wordlenght;
            i = i + _wordlenght;
            break;
        }
    }

    _cursor++;
}

return new string(tempString);

// --- Usage: time the filter, then time a naive Replace-per-word pass for comparison ---
var filter = new SensitiveWordFilter();
filter.SourceText = "dddddd";
var sourctText = filter.SourceText;
filter.ResetMemoryLexicon();

var filterTimer = System.Diagnostics.Stopwatch.StartNew();
var ss = filter.Filter();
filterTimer.Stop();
var millisecond = filterTimer.Elapsed.TotalMilliseconds;
Console.WriteLine(millisecond);
Console.WriteLine(ss);

var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
var ssx = sourctText;

var replaceTimer = System.Diagnostics.Stopwatch.StartNew();
foreach (var word in words)
{
    if (word.Length > 0)
    {
        // Mask each occurrence with as many '*' characters as the word is long.
        ssx = ssx.Replace(word, new string('*', word.Length));
    }
}
replaceTimer.Stop();
var millisecondx = replaceTimer.Elapsed.TotalMilliseconds;
Console.WriteLine(millisecondx);
Console.WriteLine(ssx);

(责任编辑:IT科技)

推荐内容
  • 莱斯硬盘分区教程(详解莱斯硬盘分区方法,让你的数据有序有力)
  • 联想XP系统安装教程(让您轻松安装联想XP系统,享受流畅操作体验)
  • 三洋滚筒衣机的高性能与先进科技(品质卓越,让您的衣物焕然一新)
  • 奇酷手机配置详解(性能、摄影和设计,一触即发)
  • 光驱安装XP系统教程(详细教你如何使用光驱安装XP系统)
  • U精灵装XP系统教程(无需光盘,轻松一键安装XP系统)